plan7.pyx 254 KB
Newer Older
1
2
# coding: utf-8
# cython: language_level=3, linetrace=True
3
4
5
6
7
8
9
10
11
"""High-level interface to the Plan7 data model.

Plan7 is the model architecture used by HMMER since HMMER2.

See Also:
    Details about the Plan 7 architecture in the `HMMER documentation
    <http://www.csb.yale.edu/userguides/seq/hmmer/docs/node11.html>`_.

"""
12

13
14
# --- C imports --------------------------------------------------------------

15
cimport cython
16
from cpython.bytes cimport PyBytes_FromStringAndSize
17
from cpython.list cimport PyList_New, PyList_SET_ITEM
18
from cpython.ref cimport PyObject
19
from cpython.exc cimport PyErr_Clear
20
21
from cpython.unicode cimport PyUnicode_DecodeASCII
from libc.math cimport exp, ceil
22
from libc.stddef cimport ptrdiff_t
23
from libc.stdio cimport printf
24
from libc.stdlib cimport calloc, malloc, realloc, free
25
from libc.stdint cimport uint8_t, uint32_t, int64_t
26
from libc.stdio cimport fprintf, FILE, stdout, fclose
27
from libc.string cimport memset, memcpy, memmove, strdup, strndup, strlen, strcmp, strncpy
28
29
from libc.time cimport ctime, strftime, time, time_t, tm, localtime_r
from unicode cimport PyUnicode_DATA, PyUnicode_KIND, PyUnicode_READ, PyUnicode_READY, PyUnicode_GET_LENGTH
30

31
cimport libeasel
32
cimport libeasel.sq
33
cimport libeasel.alphabet
34
cimport libeasel.dmatrix
35
cimport libeasel.fileparser
36
cimport libeasel.random
37
cimport libeasel.scorematrix
38
cimport libeasel.getopts
39
cimport libeasel.vec
40
cimport libhmmer
41
cimport libhmmer.modelconfig
42
cimport libhmmer.modelstats
43
cimport libhmmer.p7_alidisplay
44
cimport libhmmer.p7_hmm
45
cimport libhmmer.p7_builder
46
cimport libhmmer.p7_bg
47
cimport libhmmer.p7_domaindef
48
cimport libhmmer.p7_hit
49
50
cimport libhmmer.p7_hmmfile
cimport libhmmer.p7_pipeline
51
cimport libhmmer.p7_prior
52
cimport libhmmer.p7_profile
53
cimport libhmmer.p7_scoredata
54
cimport libhmmer.p7_tophits
55
56
cimport libhmmer.p7_trace
cimport libhmmer.tracealign
57
from libeasel cimport eslERRBUFSIZE, eslCONST_LOG2R
58
from libeasel.alphabet cimport ESL_ALPHABET, esl_alphabet_Create, esl_abc_ValidateType
59
60
from libeasel.getopts cimport ESL_GETOPTS, ESL_OPTIONS
from libeasel.sq cimport ESL_SQ
61
from libeasel.keyhash cimport ESL_KEYHASH
62
from libeasel.fileparser cimport ESL_FILEPARSER
63
from libhmmer cimport p7_LOCAL, p7_EVPARAM_UNSET, p7_CUTOFF_UNSET, p7_NEVPARAM, p7_NCUTOFFS, p7_offsets_e, p7_cutoffs_e, p7_evparams_e
64
from libhmmer.logsum cimport p7_FLogsumInit
65
from libhmmer.p7_builder cimport P7_BUILDER, p7_archchoice_e, p7_wgtchoice_e, p7_effnchoice_e
66
from libhmmer.p7_hmm cimport p7H_NTRANSITIONS, p7H_TC, p7H_GA, p7H_NC, p7H_MAP
67
from libhmmer.p7_hmmfile cimport p7_hmmfile_formats_e
68
from libhmmer.p7_hit cimport p7_hitflags_e, P7_HIT
69
from libhmmer.p7_alidisplay cimport P7_ALIDISPLAY
70
from libhmmer.p7_pipeline cimport P7_PIPELINE, p7_pipemodes_e, p7_zsetby_e, p7_strands_e, p7_complementarity_e
71
from libhmmer.p7_profile cimport p7_LOCAL, p7_GLOCAL, p7_UNILOCAL, p7_UNIGLOCAL
72
from libhmmer.p7_trace cimport P7_TRACE, p7t_statetype_e
73

74
IF HMMER_IMPL == "VMX":
75
    from libhmmer.impl_vmx cimport p7_oprofile, p7_omx, impl_Init
76
    from libhmmer.impl_vmx.io cimport p7_oprofile_Write, p7_oprofile_ReadMSV, p7_oprofile_ReadRest
77
78
79
80
81
82
83
    from libhmmer.impl_vmx.p7_oprofile cimport (
        P7_OPROFILE,
        p7O_NQB,
        p7_oprofile_Compare,
        p7_oprofile_Dump,
        p7_oprofile_Sizeof,
    )
84
ELIF HMMER_IMPL == "SSE":
85
    from libhmmer.impl_sse cimport p7_oprofile, p7_omx, impl_Init, p7_SSVFilter, p7O_EXTRA_SB
86
    from libhmmer.impl_sse.io cimport p7_oprofile_Write, p7_oprofile_ReadMSV, p7_oprofile_ReadRest
87
88
89
90
91
92
93
    from libhmmer.impl_sse.p7_oprofile cimport (
        P7_OPROFILE,
        p7O_NQB,
        p7_oprofile_Compare,
        p7_oprofile_Dump,
        p7_oprofile_Sizeof,
    )
94

95
96
97
98
from .easel cimport (
    Alphabet,
    Sequence,
    DigitalSequence,
99
    KeyHash,
100
101
102
103
    MSA,
    TextMSA,
    DigitalMSA,
    VectorF,
104
105
    MatrixF,
    MatrixU8,
106
)
107
from .reexports.p7_tophits cimport p7_tophits_Reuse
108
109
110
111
112
113
114
115
116
117
118
from .reexports.p7_hmmfile cimport (
    read_asc20hmm,
    read_asc30hmm,
    read_bin30hmm,
    v3a_magic,
    v3b_magic,
    v3c_magic,
    v3d_magic,
    v3e_magic,
    v3f_magic
)
119

120
121
122
123
124
IF UNAME_SYSNAME == "Linux":
    include "fileobj/linux.pxi"
ELIF UNAME_SYSNAME == "Darwin" or UNAME_SYSNAME.endswith("BSD"):
    include "fileobj/bsd.pxi"

125
126
# --- Python imports ---------------------------------------------------------

127
import array
128
import collections.abc
129
import datetime
130
import errno
131
import math
132
import io
133
import itertools
134
135
import os
import sys
136
137
import warnings

138
from .errors import AllocationError, UnexpectedError, AlphabetMismatch
139
from .utils import peekable
140
141


142
143
# --- Constants --------------------------------------------------------------

144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
cdef dict BUILDER_ARCHITECTURE_STRATEGY = {
    "fast": p7_archchoice_e.p7_ARCH_FAST,
    "hand": p7_archchoice_e.p7_ARCH_HAND,
}

cdef dict BUILDER_WEIGHTING_STRATEGY = {
    "pb": p7_wgtchoice_e.p7_WGT_PB,
    "gsc": p7_wgtchoice_e.p7_WGT_GSC,
    "blosum": p7_wgtchoice_e.p7_WGT_BLOSUM,
    "none": p7_wgtchoice_e.p7_WGT_NONE,
    "given": p7_wgtchoice_e.p7_WGT_GIVEN,
}

cdef dict BUILDER_EFFECTIVE_STRATEGY = {
    "entropy": p7_effnchoice_e.p7_EFFN_ENTROPY,
    "exp": p7_effnchoice_e.p7_EFFN_ENTROPY_EXP,
    "clust": p7_effnchoice_e.p7_EFFN_CLUST,
    "none": p7_effnchoice_e.p7_EFFN_NONE,
}

164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
cdef dict HMM_FILE_FORMATS = {
    "2.0": p7_hmmfile_formats_e.p7_HMMFILE_20,
    "3/a": p7_hmmfile_formats_e.p7_HMMFILE_3a,
    "3/b": p7_hmmfile_formats_e.p7_HMMFILE_3b,
    "3/c": p7_hmmfile_formats_e.p7_HMMFILE_3c,
    "3/d": p7_hmmfile_formats_e.p7_HMMFILE_3d,
    "3/e": p7_hmmfile_formats_e.p7_HMMFILE_3e,
    "3/f": p7_hmmfile_formats_e.p7_HMMFILE_3f,
}

cdef dict HMM_FILE_MAGIC = {
    v3a_magic: p7_hmmfile_formats_e.p7_HMMFILE_3a,
    v3b_magic: p7_hmmfile_formats_e.p7_HMMFILE_3b,
    v3c_magic: p7_hmmfile_formats_e.p7_HMMFILE_3c,
    v3d_magic: p7_hmmfile_formats_e.p7_HMMFILE_3d,
    v3e_magic: p7_hmmfile_formats_e.p7_HMMFILE_3e,
    v3f_magic: p7_hmmfile_formats_e.p7_HMMFILE_3f,
}

183
184
185
186
187
cdef dict PIPELINE_BIT_CUTOFFS = {
    "gathering": libhmmer.p7_hmm.p7H_GA,
    "noise": libhmmer.p7_hmm.p7H_NC,
    "trusted": libhmmer.p7_hmm.p7H_TC,
}
188

189
# --- Cython classes ---------------------------------------------------------
190

191
192

cdef class Alignment:
193
194
195
196
197
    """An alignment of a sequence to a profile.

    Attributes:
        domain (`Domain`): The domain this alignment corresponds to.

198
    """
199
200
201
202
203
204
205

    # --- Magic methods ------------------------------------------------------

    def __cinit__(self, Domain domain):
        self.domain = domain
        self._ad = domain._dom.ad

206
207
208
    def __len__(self):
        return self.hmm_to - self.hmm_from

209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
    def __str__(self):
        assert self._ad != NULL

        cdef int    status
        cdef object buffer = io.BytesIO()
        cdef FILE*  fp     = fopen_obj(buffer, "w")

        try:
            status = libhmmer.p7_alidisplay.p7_nontranslated_alidisplay_Print(
                fp,
                self._ad,
                0,
                -1,
                False,
            )
            if status == libeasel.eslEWRITE:
                raise OSError("Failed to write alignment")
            elif status != libeasel.eslOK:
                raise UnexpectedError(status, "p7_alidisplay_Print")
        finally:
            fclose(fp)

        return buffer.getvalue().decode("ascii")

233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
    cpdef dict __getstate__(self):
        assert self._ad != NULL

        cdef dict      state
        cdef ptrdiff_t ptr
        cdef str       attr
        cdef bytearray mem   = bytearray(self._ad.memsize)
        cdef char[::1] view  = mem

        # copy numerical attributes
        state = {
            "N": self._ad.N,
            "hmmfrom": self._ad.hmmfrom,
            "hmmto": self._ad.hmmto,
            "M": self._ad.M,
            "sqfrom": self._ad.sqfrom,
            "sqto": self._ad.sqto,
            "L": self._ad.L,
            "memsize": self._ad.memsize,
        }

        # copy allocated memory contents
        memcpy(&view[0], self._ad.mem, self._ad.memsize*sizeof(char))
        state["mem"] = mem

        # copy pointers to allocated memory
        for attr, ptr in [
            ("rfline", <ptrdiff_t> self._ad.rfline),
            ("mmline", <ptrdiff_t> self._ad.mmline),
            ("csline", <ptrdiff_t> self._ad.csline),
            ("model", <ptrdiff_t> self._ad.model),
            ("mline", <ptrdiff_t> self._ad.mline),
            ("aseq", <ptrdiff_t> self._ad.aseq),
            ("ntseq", <ptrdiff_t> self._ad.ntseq),
            ("ppline", <ptrdiff_t> self._ad.ppline),
            ("hmmname", <ptrdiff_t> self._ad.hmmname),
            ("hmmacc", <ptrdiff_t> self._ad.hmmacc),
            ("hmmdesc", <ptrdiff_t> self._ad.hmmdesc),
            ("sqname", <ptrdiff_t> self._ad.sqname),
            ("sqacc", <ptrdiff_t> self._ad.sqacc),
            ("sqdesc", <ptrdiff_t> self._ad.sqdesc),
        ]:
            if <void*> ptr == NULL:
                state[attr] = None
            else:
                assert <ptrdiff_t> self._ad.mem <= ptr <= (<ptrdiff_t> self._ad.mem + self._ad.memsize)
                state[attr] = ptr - <ptrdiff_t> self._ad.mem

        return state
282

283
284
285
286
    # --- Properties ---------------------------------------------------------

    @property
    def hmm_from(self):
287
288
        """`int`: The start coordinate of the alignment in the query HMM.
        """
289
        assert self._ad != NULL
290
291
292
293
        return self._ad.hmmfrom

    @property
    def hmm_to(self):
294
295
        """`int`: The end coordinate of the alignment in the query HMM.
        """
296
        assert self._ad != NULL
297
298
299
300
        return self._ad.hmmto

    @property
    def hmm_name(self):
301
302
        """`bytes`: The name of the query HMM.
        """
303
304
        assert self._ad != NULL
        assert self._ad.hmmname != NULL
305
306
        return <bytes> self._ad.hmmname

307
308
    @property
    def hmm_accession(self):
309
        """`bytes`: The accession of the query, or its name if it has none.
310
311
312

        .. versionadded:: 0.1.4

313
        """
314
315
        assert self._ad != NULL
        assert self._ad.hmmacc != NULL
316
317
        return <bytes> self._ad.hmmacc

318
319
    @property
    def hmm_sequence(self):
320
321
        """`str`: The sequence of the query HMM in the alignment.
        """
322
        assert self._ad != NULL
323
324
325
326
        return self._ad.model.decode('ascii')

    @property
    def target_from(self):
327
328
        """`int`: The start coordinate of the alignment in the target sequence.
        """
329
        assert self._ad != NULL
330
331
332
333
        return self._ad.sqfrom

    @property
    def target_name(self):
334
335
        """`bytes`: The name of the target sequence.
        """
336
337
        assert self._ad != NULL
        assert self._ad.sqname != NULL
338
339
340
341
        return <bytes> self._ad.sqname

    @property
    def target_sequence(self):
342
343
        """`str`: The sequence of the target sequence in the alignment.
        """
344
        assert self._ad != NULL
345
346
347
348
        return self._ad.aseq.decode('ascii')

    @property
    def target_to(self):
349
350
        """`int`: The end coordinate of the alignment in the target sequence.
        """
351
        assert self._ad != NULL
352
353
354
355
        return self._ad.sqto

    @property
    def identity_sequence(self):
356
357
        """`str`: The identity sequence between the query and the target.
        """
358
        assert self._ad != NULL
359
360
361
        return self._ad.mline.decode('ascii')


362
363
cdef class Background:
    """The null background model of HMMER.
364
365
366
367
368
369

    Attributes:
        alphabet (`~pyhmmer.easel.Alphabet`): The alphabet of the
            backgound model.
        uniform (`bool`): Whether or not the null model has been created
            with uniform frequencies.
370
371
        residue_frequencies (`~pyhmmer.easel.VectorF`): The null1 background
            residue frequencies.
372

373
    .. versionchanged:: 0.4.0
374
375
       Added the `residue_frequencies` attribute.

376
377
378
379
380
381
    """

    # --- Magic methods ------------------------------------------------------

    def __cinit__(self):
        self._bg = NULL
382
        self._L  = 350
383
        self.alphabet = None
384
        self.residue_frequencies = None
385
        self.uniform = False
386
387

    def __init__(self, Alphabet alphabet, bint uniform=False):
388
389
390
        """__init__(self, alphabet, uniform=False)\n--

        Create a new background model for the given ``alphabet``.
391
392
393
394
395
396
397
398

        Arguments:
          alphabet (`pyhmmer.easel.Alphabet`): The alphabet to create the
              background model with.
          uniform (`bool`): Whether or not to create the null model with
              uniform frequencies. Defaults to `False`.

        """
399
        # store the alphabet so it's not deallocated
400
        self.alphabet = alphabet
401
        # store whether or not the null model has uniform frequencies
402
        self.uniform = uniform
403
        # create the background profile
404
405
406
407
408
        with nogil:
            if uniform:
                self._bg = libhmmer.p7_bg.p7_bg_CreateUniform(alphabet._abc)
            else:
                self._bg = libhmmer.p7_bg.p7_bg_Create(alphabet._abc)
409
        if self._bg == NULL:
410
            raise AllocationError("P7_BG", sizeof(P7_BG))
411
        # expose the residue frequencies as the `residue_frequencies` attribute
412
413
414
415
416
        self.residue_frequencies = VectorF.__new__(VectorF)
        self.residue_frequencies._data = &(self._bg.f[0])
        self.residue_frequencies._owner = self
        self.residue_frequencies._n = self.residue_frequencies._shape[0] = self.alphabet.K

417
418
419
420
421
422
    def __dealloc__(self):
        libhmmer.p7_bg.p7_bg_Destroy(self._bg)

    def __copy__(self):
        return self.copy()

423
    # --- Properties ---------------------------------------------------------
424
425

    @property
426
    def L(self):
427
428
        """`int`: The mean of the null model length distribution, in residues.
        """
429
        assert self._bg != NULL
430
        return self._L
431

432
433
    @L.setter
    def L(self, int L):
434
435
        assert self._bg != NULL

436
437
438
439
        cdef int    status
        cdef P7_BG* bg     = self._bg

        with nogil:
440
            status = libhmmer.p7_bg.p7_bg_SetLength(bg, L)
441
442
        if status != libeasel.eslOK:
            raise UnexpectedError(status, "p7_bg_SetLength")
443
444
445
446
447
        self._L = L

    @property
    def transition_probability(self):
        r"""`float`: The null1 transition probability (:math:`\frac{L}{L+1}`).
448

449
        .. versionadded:: 0.4.0
450

451
452
453
454
455
456
        """
        assert self._bg != NULL
        return self._bg.p1

    @property
    def omega(self):
457
        """`float`: The *prior* on *null2*/*null3*.
458

459
        .. versionadded:: 0.4.0
460

461
462
463
464
465
466
467
468
        """
        assert self._bg != NULL
        return self._bg.omega

    @omega.setter
    def omega(self, float omega):
        assert self._bg != NULL
        self._bg.omega = omega
469
470
471
472

    # --- Methods ------------------------------------------------------------

    cpdef Background copy(self):
473
474
475
476
        """copy(self)\n--

        Create a copy of the null model with the same parameters.

477
        """
478
479
        assert self._bg != NULL

480
        cdef Background new = Background.__new__(Background)
481
482
        with nogil:
            new._bg = libhmmer.p7_bg.p7_bg_Clone(self._bg)
483
        if new._bg == NULL:
484
            raise AllocationError("P7_BG", sizeof(P7_BG))
485
486
487
488
489
490
491

        new.alphabet = self.alphabet
        new.uniform = self.uniform
        new.residue_frequencies = VectorF.__new__(VectorF)
        new.residue_frequencies._data = &(new._bg.f[0])
        new.residue_frequencies._owner = new
        new.residue_frequencies._n = new.residue_frequencies._shape[0] = new.alphabet.K
492
493
494
        return new


495
496
cdef class Builder:
    """A factory for constructing new HMMs from raw sequences.
497

498
499
500
    Attributes:
        alphabet (`~pyhmmer.easel.Alphabet`): The alphabet the builder is
            configured to use to convert sequences to HMMs.
501
502
        randomness (`~pyhmmer.easel.Randomness`): The random number generator
            being used by the builder.
503
504
505
506
507
508
        score_matrix (`str`): The name of the substitution matrix used to
            build HMMs for single sequences.
        popen (`float`): The *gap open* probability to use when building
            HMMs from single sequences.
        pextend (`float`): The *gap extend* probability to use when building
            HMMs from single sequences.
509

510
511
    .. versionadded:: 0.2.0

512
    .. versionchanged:: 0.4.2
513
       Added the `~Builder.randomness` attribute.
514

515
516
    """

517
518
519
    _ARCHITECTURE_STRATEGY = dict(BUILDER_ARCHITECTURE_STRATEGY)
    _WEIGHTING_STRATEGY = dict(BUILDER_WEIGHTING_STRATEGY)
    _EFFECTIVE_STRATEGY = dict(BUILDER_EFFECTIVE_STRATEGY)
520
521
522
523
524
525
526
527
528
529
530
531
532

    # --- Magic methods ------------------------------------------------------

    def __cinit__(self):
        self._bld = NULL
        self.alphabet = None

    def __init__(
        self,
        Alphabet alphabet,
        *,
        str architecture="fast",
        str weighting="pb",
533
        object effective_number="entropy",
534
535
536
        str prior_scheme="alphabet",
        float symfrac=0.5,
        float fragthresh=0.5,
537
538
539
        double wid=0.62,
        double esigma=45.0,
        double eid=0.62,
540
541
542
543
544
545
        int EmL=200,
        int EmN=200,
        int EvL=200,
        int EvN=200,
        int EfL=100,
        int EfN=200,
546
        double Eft=0.04,
547
        uint32_t seed=42,
548
        object ere=None,
549
550
        object popen=None,
        object pextend=None,
551
        object score_matrix=None,
552
553
        object window_length=None,
        object window_beta=None,
554
    ):
555
        """__init__(self, alphabet, *, architecture="fast", weighting="pb", effective_number="entropy", prior_scheme="alpha", symfrac=0.5, fragthresh=0.5, wid=0.62, esigma=45.0, eid=0.62, EmL=200, EmN=200, EvL=200, EvN=200, EfL=100, EfN=200, Eft=0.04, seed=42, ere=None, popen=None, pextend=None, window_length=None, window_beta=None)\n--
556
557
558
559
560
561
562

        Create a new sequence builder with the given configuration.

        Arguments:
            alphabet (`~pyhmmer.easel.Alphabet`): The alphabet the builder
                expects the sequences to be in.

563
        Keyword Arguments:
564
565
566
567
568
569
570
571
572
573
574
575
            architecture (`str`): The algorithm to use to determine the
                model architecture, either ``"fast"`` (the default), or
                ``"hand"``.
            weighting (`str`): The algorithm to use for relative sequence
                weighting, either ``"pb"`` (the default), ``"gsc"``,
                ``"blosum"``, ``"none"``, or ``"given"``.
            effective_number (`str`, `int`, or `float`): The algorithm to
                use to determine the effective sequence number, either
                ``"entropy"`` (the default), ``"exp"``, ``"clust"``, ``"none"``.
                A number can also be given directly to set the effective
                sequence number manually.
            prior_scheme (`str`): The choice of mixture Dirichlet prior when
576
577
                parameterizing  from counts, either ``"laplace"``
                or ``"alphabet"`` (the default).
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
            symfrac (`float`): The residue occurrence threshold for fast
                architecture determination.
            fragthresh (`float`): A threshold such that a sequence is called
                a fragment when :math:`L \\le fragthresh \times alen`.
            wid (`double`): The percent identity threshold for BLOSUM relative
                weighting.
            esigma (`float`): The minimum total relative entropy parameter
                for effective number entropy weights.
            eid (`float`): The percent identity threshold for effective
                number clustering.
            EmL (`int`): The length of sequences generated for MSV fitting.
            EmN (`int`): The number of sequences generated for MSV fitting.
            EvL (`int`): The lenght of sequences generated for Viterbi fitting.
            EvN (`int`): The number of sequences generated for Viterbi fitting.
            EfL (`int`): The lenght of sequences generated for Forward fitting.
            EfN (`int`): The number of sequences generated for Forward fitting.
            Eft (`float`): The tail mass used for Forward fitting.
            seed (`int`): The seed to use to initialize the internal random
                number generator. If ``0`` is given, an arbitrary seed will
                be chosen based on the current time.
            ere (`double`, optional): The relative entropy target for effective
                number weighting, or `None`.
600
601
602
            popen (`float`, optional): The *gap open* probability to use
                when building HMMs from single sequences. The default value
                depends on the alphabet: *0.02* for proteins,
603
                *0.03125* for nucleotides.
604
605
606
607
608
609
610
611
612
            pextend (`float`, optional): The *gap extend* probability to use
                when building HMMs from single sequences. Default depends on
                the alphabet: *0.4* for proteins, *0.75* for nucleotides.
            score_matrix (`str`, optional): The name of the score matrix to
                use when building HMMs from single sequences. The only
                allowed value for nucleotide alphabets is *DNA1*. For
                proteins, *PAM30*, *PAM70*, *PAM120*, *PAM240*, *BLOSUM45*,
                *BLOSUM50*, *BLOSUM62* (the default), *BLOSUM80* or
                *BLOSUM90* can be used.
613
614
            window_length (`float`, optional): The window length for
                nucleotide sequences, essentially the max expected hit length.
615
                *If given, takes precedence over* ``window_beta``.
616
617
            window_beta (`float`, optional): The tail mass at which window
                length is determined for nucleotide sequences.
618

619
        """
620
        # extract alphabet and create raw P7_BUILDER object
621
        self.alphabet = alphabet
622
        abcty = alphabet._abc.type
623
624
        with nogil:
            self._bld = libhmmer.p7_builder.p7_builder_Create(NULL, alphabet._abc)
625
        if self._bld == NULL:
626
            raise AllocationError("P7_BG", sizeof(P7_BG))
627

628
629
630
631
632
633
634
635
        # create a Randomness object exposing the internal RNG
        self.randomness = Randomness.__new__(Randomness)
        self.randomness._owner = self
        self.randomness._rng = self._bld.r

        # store the seed given at initialization and reseed the RNG
        self.seed = seed

636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
        # set numeric parameters
        self._bld.symfrac = symfrac
        self._bld.fragthresh = fragthresh
        self._bld.wid = wid
        self._bld.esigma = esigma
        self._bld.eid = eid
        self._bld.EmL = EmL
        self._bld.EmN = EmN
        self._bld.EvL = EvL
        self._bld.EvN = EvN
        self._bld.EfL = EfL
        self._bld.EfN = EfN
        self._bld.Eft = Eft

        # set the architecture strategy
651
        self.architecture = architecture
652
        _arch = BUILDER_ARCHITECTURE_STRATEGY.get(architecture)
653
654
655
656
657
658
        if _arch is not None:
            self._bld.arch_strategy = _arch
        else:
            raise ValueError(f"Invalid value for 'architecture': {architecture}")

        # set the weighting strategy
659
        self.weighting = weighting
660
        _weighting = BUILDER_WEIGHTING_STRATEGY.get(weighting)
661
662
663
664
665
666
        if _weighting is not None:
            self._bld.wgt_strategy = _weighting
        else:
            raise ValueError(f"Invalid value for 'weighting': {weighting}")

        # set the effective sequence number strategy
667
        self.effective_number = effective_number
668
669
670
        if isinstance(effective_number, (int, float)):
            self._bld.effn_strategy = p7_effnchoice_e.p7_EFFN_SET
            self._bld.eset = effective_number
671
        elif isinstance(effective_number, str):
672
            _effn = BUILDER_EFFECTIVE_STRATEGY.get(effective_number)
673
674
675
676
            if _effn is not None:
                self._bld.effn_strategy = _effn
            else:
                raise ValueError(f"Invalid value for 'effective_number': {effective_number}")
677
678
679
        else:
            ty = type(effective_number).__name__
            raise TypeError(f"Invalid type for 'effective_number': {ty}")
680

681
682
683
684
685
686
687
688
689
690
        # set the RE target if given one, otherwise the default
        # is alphabet dependent
        if ere is not None:
            self._bld.re_target = ere
        elif alphabet.is_nucleotide():
            self._bld.re_target = libhmmer.p7_ETARGET_DNA
        elif alphabet.is_amino():
            self._bld.re_target = libhmmer.p7_ETARGET_AMINO
        else:
            self._bld.re_target = libhmmer.p7_ETARGET_OTHER
691
692

        # set the prior scheme
693
        self.prior_scheme = prior_scheme
694
695
696
697
698
699
700
701
702
        if prior_scheme is None:
            self._bld.prior = NULL
        elif prior_scheme == "laplace":
            self._bld.prior = libhmmer.p7_prior.p7_prior_CreateLaplace(self.alphabet._abc)
        elif prior_scheme == "alphabet":
            if alphabet.is_amino():
                self._bld.prior = libhmmer.p7_prior.p7_prior_CreateAmino()
            elif alphabet.is_nucleotide():
                self._bld.prior = libhmmer.p7_prior.p7_prior_CreateNucleic()
703
            else:
704
705
706
                self._bld.prior = libhmmer.p7_prior.p7_prior_CreateLaplace(self.alphabet._abc)
        else:
            raise ValueError("Invalid value for 'prior_scheme': {prior_scheme}")
707

708
        # set the gap probabilities and score matrix using alphabet-specific
709
        # defaults (this is only used when building a HMM for a single sequence)
710
        if alphabet.is_nucleotide():
711
            self.score_matrix = "DNA1" if score_matrix is None else score_matrix
712
713
714
            self.popen = 0.03125 if popen is None else popen
            self.pextend = 0.75 if pextend is None else pextend
        else:
715
            self.score_matrix = "BLOSUM62" if score_matrix is None else score_matrix
716
717
718
            self.popen = 0.02 if popen is None else popen
            self.pextend = 0.4 if pextend is None else pextend

719
720
721
722
723
724
        # set the window options
        self.window_length = window_length
        if window_beta is None:
            self.window_beta = libhmmer.p7_builder.p7_DEFAULT_WINDOW_BETA
        else:
            self.window_beta = window_beta
725

726
727
728
    def __dealloc__(self):
        libhmmer.p7_builder.p7_builder_Destroy(self._bld)

729
730
731
    def __copy__(self):
        return self.copy()

732
733
734
735
    # --- Properties ---------------------------------------------------------

    @property
    def seed(self):
736
737
738
739
740
        """`int`: The seed given at builder initialization.

        Setting this attribute to a different value will cause the internal
        random number generator to be reseeded immediately.

741
        .. versionchanged:: 0.4.2
742
           Avoid shadowing initial null seeds given on builder initialization.
743

744
        """
745
        return self._seed
746
747
748

    @seed.setter
    def seed(self, uint32_t seed):
749
        self._seed = seed
750
        self._bld.do_reseeding = seed != 0
751
        self.randomness._seed(seed)
752

753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
    @property
    def window_length(self):
        """`int` or `None`: The window length for nucleotide sequences.
        """
        assert self._bld != NULL
        return None if self._bld.w_len == -1 else self._bld.w_len

    @window_length.setter
    def window_length(self, object window_length):
        assert self._bld != NULL
        if window_length is None:
            self._bld.w_len = -1
        elif window_length > 3:
            self._bld.w_len = window_length
        else:
            raise ValueError(f"Invalid window length: {window_length!r}")

    @property
    def window_beta(self):
        """`float`: The tail mass at which window length is determined.
        """
        assert self._bld != NULL
        return self._bld.w_beta

    @window_beta.setter
    def window_beta(self, double window_beta):
        assert self._bld != NULL
        if window_beta > 1 or window_beta < 0:
            raise ValueError(f"Invalid window tail mass: {window_beta!r}")
        self._bld.w_beta = window_beta

784
785
    # --- Methods ------------------------------------------------------------

786
787
788
789
790
    cpdef tuple build(
        self,
        DigitalSequence sequence,
        Background background,
    ):
791
        """build(self, sequence, background)\n--
792
793

        Build a new HMM from ``sequence`` using the builder configuration.
794
795
796
797

        Arguments:
            sequence (`~pyhmmer.easel.DigitalSequence`): A single biological
                sequence in digital mode to build a HMM with.
798
799
            background (`pyhmmer.plan7.background`): The background model
                to use to create the HMM.
800

801
802
803
804
        Returns:
            (`HMM`, `Profile`, `OptimizedProfile`): A tuple containing the
            new HMM as well as profiles to be used directly in a `Pipeline`.

805
        Raises:
806
807
            `~pyhmmer.errors.AlphabetMismatch`: When either ``sequence`` or
                ``background`` have the wrong alphabet for this builder.
808
809
            `ValueError`: When the HMM cannot be created successfully
                from the input; the error message contains more details.
810

811
812
813
814
815
816
817
818
819
820
821
        Hint:
            The score matrix and the gap probabilities used here can be set
            when initializing the builder, or changed by setting a new value
            to the right attribute::

                >>> alphabet = easel.Alphabet.amino()
                >>> background = plan7.Background(alphabet)
                >>> builder = plan7.Builder(alphabet, popen=0.05)
                >>> builder.score_matrix = "BLOSUM90"
                >>> hmm, _, _ = builder.build(proteins[0], background)

822
823
824
        .. versionchanged:: 0.4.6
           Sets the `HMM.creation_time` attribute with the current time.

825
826
827
        .. versionchanged:: 0.4.10
           The score system is now cached between calls to `Builder.build`.

828
        """
829
830
        assert self._bld != NULL

831
832
833
        cdef int              status
        cdef HMM              hmm     = HMM.__new__(HMM)
        cdef Profile          profile = Profile.__new__(Profile)
834
        cdef OptimizedProfile opti    = OptimizedProfile.__new__(OptimizedProfile)
835
        cdef bytes            mx      = self.score_matrix.encode('utf-8')
836
        cdef char*            mx_ptr  = <char*> mx
837
        cdef str              msg
838

839
840
        # reseed RNG used by the builder if needed
        if self._bld.do_reseeding:
841
            self.randomness._seed(self.seed)
842

843
        # check alphabet and assign it to newly created objects
844
        hmm.alphabet = profile.alphabet = opti.alphabet = self.alphabet
845
        if not self.alphabet._eq(background.alphabet):
846
            raise AlphabetMismatch(self.alphabet, background.alphabet)
847
        if not self.alphabet._eq(sequence.alphabet):
848
            raise AlphabetMismatch(self.alphabet, sequence.alphabet)
849

850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
        # only load the score system if it hasn't been loaded already,
        # or if a different score system is in use
        if self._bld.S == NULL or strcmp(self._bld.S.name, mx_ptr) != 0:
            with nogil:
                status = libhmmer.p7_builder.p7_builder_LoadScoreSystem(
                    self._bld,
                    mx_ptr,
                    self.popen,
                    self.pextend,
                    background._bg
                )
            if status == libeasel.eslEINVAL:
                msg = self._bld.errbuf.decode('utf-8', 'ignore')
                raise RuntimeError(f"failed to set single sequence score system: {msg}")
            elif status != libeasel.eslOK:
                raise UnexpectedError(status, "p7_builder_LoadScoreSystem")
        else:
            self._bld.popen = self.popen
            self._bld.pextend = self.pextend
869
870
871

        # build HMM and profiles
        with nogil:
872
873
874
875
876
877
878
879
880
            status = libhmmer.p7_builder.p7_SingleBuilder(
                self._bld,
                sequence._sq,
                background._bg,
                &hmm._hmm, # HMM
                NULL, # traceback
                &profile._gm, # profile,
                &opti._om, # optimized profile
            )
881
        if status == libeasel.eslOK:
882
            hmm.command_line = " ".join(sys.argv)
883
        elif status == libeasel.eslEINVAL:
884
885
            msg = self._bld.errbuf.decode("utf-8", "replace")
            raise ValueError("Could not build HMM: {}".format(msg))
886
887
888
        else:
            raise UnexpectedError(status, "p7_SingleBuilder")

889
890
891
        # return newly built HMM, profile and optimized profile
        return hmm, profile, opti

892
893
894
895
896
    cpdef tuple build_msa(
        self,
        DigitalMSA msa,
        Background background,
    ):
897
898
899
900
901
902
903
904
905
906
        """build_msa(self, msa, background)\n--

        Build a new HMM from ``msa`` using the builder configuration.

        Arguments:
            msa (`~pyhmmer.easel.DigitalMSA`): A multiple sequence
                alignment in digital mode to build a HMM with.
            background (`pyhmmer.plan7.background`): The background model
                to use to create the HMM.

907
908
909
910
        Returns:
            (`HMM`, `Profile`, `OptimizedProfile`): A tuple containing the
            new HMM as well as profiles to be used directly in a `Pipeline`.

911
        Raises:
912
913
            `~pyhmmer.errors.AlphabetMismatch`: When either ``msa`` or
                ``background`` have the wrong alphabet for this builder.
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
            `ValueError`: When the HMM cannot be created successfully
                from the input; the error message contains more details.

        Caution:
            HMMER requires that every HMM has a name, so the `Builder` will
            attempt to use the name of the sequence to name the HMM. Passing
            an MSA without a name will result in an error::

                >>> alphabet = easel.Alphabet.amino()
                >>> msa = easel.TextMSA(sequences=[
                ...   easel.TextSequence(name=b"ustiA", sequence="YAIG"),
                ...   easel.TextSequence(name=b"ustiB", sequence="YVIG")
                ... ])
                >>> builder = plan7.Builder(alphabet)
                >>> background = plan7.Background(alphabet)
                >>> builder.build_msa(msa.digitize(alphabet), background)
                Traceback (most recent call last):
                  ...
                ValueError: Could not build HMM: Unable to name the HMM.
933

934
935
        .. versionadded:: 0.3.0

936
937
938
        .. versionchanged:: 0.4.6
           Sets the `HMM.creation_time` attribute with the current time.

939
940
941
        """
        assert self._bld != NULL

942
943
944
945
        cdef int              status
        cdef HMM              hmm     = HMM.__new__(HMM)
        cdef Profile          profile = Profile.__new__(Profile)
        cdef OptimizedProfile opti    = OptimizedProfile.__new__(OptimizedProfile)
946
        cdef str              msg
947
        cdef size_t           k
948

949
950
        # unset builder probabilities and score system if they were set
        # by a call to `build`, since we won't be needing them here
951
        self._bld.popen = self._bld.pextend = -1
952
953
954
955
956
957
        if self._bld.S != NULL:
            libeasel.scorematrix.esl_scorematrix_Destroy(self._bld.S)
            self._bld.S = NULL
        if self._bld.Q != NULL:
            libeasel.dmatrix.esl_dmatrix_Destroy(self._bld.Q)
            self._bld.Q = NULL
958
959
960

        # reseed RNG used by the builder if needed
        if self._bld.do_reseeding:
961
            self.randomness._seed(self.seed)
962
963
964

        # check alphabet and assign it to newly created objects
        hmm.alphabet = profile.alphabet = opti.alphabet = self.alphabet
965
        if not self.alphabet._eq(background.alphabet):
966
            raise AlphabetMismatch(self.alphabet, background.alphabet)
967
        if not self.alphabet._eq(msa.alphabet):
968
            raise AlphabetMismatch(self.alphabet, msa.alphabet)
969

970
        # build HMM
971
972
973
974
975
976
977
978
979
980
981
982
983
        with nogil:
            # build HMM and profiles
            status = libhmmer.p7_builder.p7_Builder(
                self._bld,
                msa._msa,
                background._bg,
                &hmm._hmm, # HMM
                NULL, # traceback
                &profile._gm, # profile,
                &opti._om, # optimized profile
                NULL # modified msa
            )
        if status == libeasel.eslOK:
984
            hmm.command_line = " ".join(sys.argv)
985
        elif status == libeasel.eslEINVAL:
986
987
            msg = self._bld.errbuf.decode("utf-8", "replace")
            raise ValueError("Could not build HMM: {}".format(msg))
988
989
        else:
            raise UnexpectedError(status, "p7_Builder")
990

991
992
993
        # return newly built HMM, profile and optimized profile
        return hmm, profile, opti

994
    cpdef Builder copy(self):
995
996
997
998
        """copy(self)\n--

        Create a duplicate `Builder` instance with the same arguments.

999
1000
        """
        assert self._bld != NULL
For faster browsing, not all history is shown. View entire blame