plan7.pyx 247 KB
Newer Older
1
2
# coding: utf-8
# cython: language_level=3, linetrace=True
3
4
5
6
7
8
9
10
11
"""High-level interface to the Plan7 data model.

Plan7 is the model architecture used by HMMER since HMMER2.

See Also:
    Details about the Plan 7 architecture in the `HMMER documentation
    <http://www.csb.yale.edu/userguides/seq/hmmer/docs/node11.html>`_.

"""
12

13
14
# --- C imports --------------------------------------------------------------

15
cimport cython
16
from cpython.bytes cimport PyBytes_FromStringAndSize
17
from cpython.list cimport PyList_New, PyList_SET_ITEM
18
from cpython.ref cimport PyObject
19
from cpython.exc cimport PyErr_Clear
20
21
from cpython.unicode cimport PyUnicode_DecodeASCII
from libc.math cimport exp, ceil
22
from libc.stddef cimport ptrdiff_t
23
from libc.stdio cimport printf
24
from libc.stdlib cimport calloc, malloc, realloc, free
25
from libc.stdint cimport uint8_t, uint32_t, int64_t
26
from libc.stdio cimport fprintf, FILE, stdout, fclose
27
from libc.string cimport memset, memcpy, memmove, strdup, strndup, strlen, strcmp, strncpy
28
29
from libc.time cimport ctime, strftime, time, time_t, tm, localtime_r
from unicode cimport PyUnicode_DATA, PyUnicode_KIND, PyUnicode_READ, PyUnicode_READY, PyUnicode_GET_LENGTH
30

31
cimport libeasel
32
cimport libeasel.sq
33
cimport libeasel.alphabet
34
cimport libeasel.dmatrix
35
cimport libeasel.fileparser
36
cimport libeasel.random
37
cimport libeasel.scorematrix
38
cimport libeasel.getopts
39
cimport libeasel.vec
40
cimport libhmmer
41
cimport libhmmer.modelconfig
42
cimport libhmmer.modelstats
43
cimport libhmmer.p7_alidisplay
44
cimport libhmmer.p7_hmm
45
cimport libhmmer.p7_builder
46
cimport libhmmer.p7_bg
47
cimport libhmmer.p7_domaindef
48
cimport libhmmer.p7_hit
49
50
cimport libhmmer.p7_hmmfile
cimport libhmmer.p7_pipeline
51
cimport libhmmer.p7_prior
52
cimport libhmmer.p7_profile
53
cimport libhmmer.p7_scoredata
54
cimport libhmmer.p7_tophits
55
56
cimport libhmmer.p7_trace
cimport libhmmer.tracealign
57
from libeasel cimport eslERRBUFSIZE, eslCONST_LOG2R
58
from libeasel.alphabet cimport ESL_ALPHABET, esl_alphabet_Create, esl_abc_ValidateType
59
60
from libeasel.getopts cimport ESL_GETOPTS, ESL_OPTIONS
from libeasel.sq cimport ESL_SQ
61
from libeasel.keyhash cimport ESL_KEYHASH
62
from libeasel.fileparser cimport ESL_FILEPARSER
63
from libhmmer cimport p7_LOCAL, p7_EVPARAM_UNSET, p7_CUTOFF_UNSET, p7_NEVPARAM, p7_NCUTOFFS, p7_offsets_e, p7_cutoffs_e, p7_evparams_e
64
from libhmmer.logsum cimport p7_FLogsumInit
65
from libhmmer.p7_builder cimport P7_BUILDER, p7_archchoice_e, p7_wgtchoice_e, p7_effnchoice_e
66
from libhmmer.p7_hmm cimport p7H_NTRANSITIONS, p7H_TC, p7H_GA, p7H_NC, p7H_MAP
67
from libhmmer.p7_hmmfile cimport p7_hmmfile_formats_e
68
from libhmmer.p7_hit cimport p7_hitflags_e, P7_HIT
69
from libhmmer.p7_alidisplay cimport P7_ALIDISPLAY
70
from libhmmer.p7_pipeline cimport P7_PIPELINE, p7_pipemodes_e, p7_zsetby_e, p7_strands_e, p7_complementarity_e
71
from libhmmer.p7_profile cimport p7_LOCAL, p7_GLOCAL, p7_UNILOCAL, p7_UNIGLOCAL
72
from libhmmer.p7_trace cimport P7_TRACE, p7t_statetype_e
73

74
IF HMMER_IMPL == "VMX":
75
    from libhmmer.impl_vmx cimport p7_oprofile, p7_omx, impl_Init
76
    from libhmmer.impl_vmx.io cimport p7_oprofile_Write, p7_oprofile_ReadMSV, p7_oprofile_ReadRest
77
78
79
80
81
82
83
    from libhmmer.impl_vmx.p7_oprofile cimport (
        P7_OPROFILE,
        p7O_NQB,
        p7_oprofile_Compare,
        p7_oprofile_Dump,
        p7_oprofile_Sizeof,
    )
84
ELIF HMMER_IMPL == "SSE":
85
    from libhmmer.impl_sse cimport p7_oprofile, p7_omx, impl_Init, p7_SSVFilter, p7O_EXTRA_SB
86
    from libhmmer.impl_sse.io cimport p7_oprofile_Write, p7_oprofile_ReadMSV, p7_oprofile_ReadRest
87
88
89
90
91
92
93
    from libhmmer.impl_sse.p7_oprofile cimport (
        P7_OPROFILE,
        p7O_NQB,
        p7_oprofile_Compare,
        p7_oprofile_Dump,
        p7_oprofile_Sizeof,
    )
94

95
96
97
98
from .easel cimport (
    Alphabet,
    Sequence,
    DigitalSequence,
99
    KeyHash,
100
101
102
103
    MSA,
    TextMSA,
    DigitalMSA,
    VectorF,
104
105
    MatrixF,
    MatrixU8,
106
)
107
from .reexports.p7_tophits cimport p7_tophits_Reuse
108
109
110
111
112
113
114
115
116
117
118
from .reexports.p7_hmmfile cimport (
    read_asc20hmm,
    read_asc30hmm,
    read_bin30hmm,
    v3a_magic,
    v3b_magic,
    v3c_magic,
    v3d_magic,
    v3e_magic,
    v3f_magic
)
119

120
121
122
123
124
IF UNAME_SYSNAME == "Linux":
    include "fileobj/linux.pxi"
ELIF UNAME_SYSNAME == "Darwin" or UNAME_SYSNAME.endswith("BSD"):
    include "fileobj/bsd.pxi"

125
126
# --- Python imports ---------------------------------------------------------

127
import array
128
import collections.abc
129
import datetime
130
import errno
131
import math
132
import io
133
import itertools
134
135
import os
import sys
136
137
import warnings

138
from .errors import AllocationError, UnexpectedError, AlphabetMismatch
139
from .utils import peekable
140
141


142
143
# --- Constants --------------------------------------------------------------

144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
cdef dict BUILDER_ARCHITECTURE_STRATEGY = {
    "fast": p7_archchoice_e.p7_ARCH_FAST,
    "hand": p7_archchoice_e.p7_ARCH_HAND,
}

cdef dict BUILDER_WEIGHTING_STRATEGY = {
    "pb": p7_wgtchoice_e.p7_WGT_PB,
    "gsc": p7_wgtchoice_e.p7_WGT_GSC,
    "blosum": p7_wgtchoice_e.p7_WGT_BLOSUM,
    "none": p7_wgtchoice_e.p7_WGT_NONE,
    "given": p7_wgtchoice_e.p7_WGT_GIVEN,
}

cdef dict BUILDER_EFFECTIVE_STRATEGY = {
    "entropy": p7_effnchoice_e.p7_EFFN_ENTROPY,
    "exp": p7_effnchoice_e.p7_EFFN_ENTROPY_EXP,
    "clust": p7_effnchoice_e.p7_EFFN_CLUST,
    "none": p7_effnchoice_e.p7_EFFN_NONE,
}

164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
cdef dict HMM_FILE_FORMATS = {
    "2.0": p7_hmmfile_formats_e.p7_HMMFILE_20,
    "3/a": p7_hmmfile_formats_e.p7_HMMFILE_3a,
    "3/b": p7_hmmfile_formats_e.p7_HMMFILE_3b,
    "3/c": p7_hmmfile_formats_e.p7_HMMFILE_3c,
    "3/d": p7_hmmfile_formats_e.p7_HMMFILE_3d,
    "3/e": p7_hmmfile_formats_e.p7_HMMFILE_3e,
    "3/f": p7_hmmfile_formats_e.p7_HMMFILE_3f,
}

cdef dict HMM_FILE_MAGIC = {
    v3a_magic: p7_hmmfile_formats_e.p7_HMMFILE_3a,
    v3b_magic: p7_hmmfile_formats_e.p7_HMMFILE_3b,
    v3c_magic: p7_hmmfile_formats_e.p7_HMMFILE_3c,
    v3d_magic: p7_hmmfile_formats_e.p7_HMMFILE_3d,
    v3e_magic: p7_hmmfile_formats_e.p7_HMMFILE_3e,
    v3f_magic: p7_hmmfile_formats_e.p7_HMMFILE_3f,
}

183
184
185
186
187
cdef dict PIPELINE_BIT_CUTOFFS = {
    "gathering": libhmmer.p7_hmm.p7H_GA,
    "noise": libhmmer.p7_hmm.p7H_NC,
    "trusted": libhmmer.p7_hmm.p7H_TC,
}
188

189
# --- Cython classes ---------------------------------------------------------
190

191
192

cdef class Alignment:
193
194
195
196
197
    """An alignment of a sequence to a profile.

    Attributes:
        domain (`Domain`): The domain this alignment corresponds to.

198
    """
199
200
201
202
203
204
205

    # --- Magic methods ------------------------------------------------------

    def __cinit__(self, Domain domain):
        self.domain = domain
        self._ad = domain._dom.ad

206
207
208
    def __len__(self):
        return self.hmm_to - self.hmm_from

209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
    def __str__(self):
        assert self._ad != NULL

        cdef int    status
        cdef object buffer = io.BytesIO()
        cdef FILE*  fp     = fopen_obj(buffer, "w")

        try:
            status = libhmmer.p7_alidisplay.p7_nontranslated_alidisplay_Print(
                fp,
                self._ad,
                0,
                -1,
                False,
            )
            if status == libeasel.eslEWRITE:
                raise OSError("Failed to write alignment")
            elif status != libeasel.eslOK:
                raise UnexpectedError(status, "p7_alidisplay_Print")
        finally:
            fclose(fp)

        return buffer.getvalue().decode("ascii")


234
235
236
237
    # --- Properties ---------------------------------------------------------

    @property
    def hmm_from(self):
238
239
        """`int`: The start coordinate of the alignment in the query HMM.
        """
240
        assert self._ad != NULL
241
242
243
244
        return self._ad.hmmfrom

    @property
    def hmm_to(self):
245
246
        """`int`: The end coordinate of the alignment in the query HMM.
        """
247
        assert self._ad != NULL
248
249
250
251
        return self._ad.hmmto

    @property
    def hmm_name(self):
252
253
        """`bytes`: The name of the query HMM.
        """
254
255
        assert self._ad != NULL
        assert self._ad.hmmname != NULL
256
257
        return <bytes> self._ad.hmmname

258
259
    @property
    def hmm_accession(self):
260
        """`bytes`: The accession of the query, or its name if it has none.
261
262
263

        .. versionadded:: 0.1.4

264
        """
265
266
        assert self._ad != NULL
        assert self._ad.hmmacc != NULL
267
268
        return <bytes> self._ad.hmmacc

269
270
    @property
    def hmm_sequence(self):
271
272
        """`str`: The sequence of the query HMM in the alignment.
        """
273
        assert self._ad != NULL
274
275
276
277
        return self._ad.model.decode('ascii')

    @property
    def target_from(self):
278
279
        """`int`: The start coordinate of the alignment in the target sequence.
        """
280
        assert self._ad != NULL
281
282
283
284
        return self._ad.sqfrom

    @property
    def target_name(self):
285
286
        """`bytes`: The name of the target sequence.
        """
287
288
        assert self._ad != NULL
        assert self._ad.sqname != NULL
289
290
291
292
        return <bytes> self._ad.sqname

    @property
    def target_sequence(self):
293
294
        """`str`: The sequence of the target sequence in the alignment.
        """
295
        assert self._ad != NULL
296
297
298
299
        return self._ad.aseq.decode('ascii')

    @property
    def target_to(self):
300
301
        """`int`: The end coordinate of the alignment in the target sequence.
        """
302
        assert self._ad != NULL
303
304
305
306
        return self._ad.sqto

    @property
    def identity_sequence(self):
307
308
        """`str`: The identity sequence between the query and the target.
        """
309
        assert self._ad != NULL
310
311
312
        return self._ad.mline.decode('ascii')


313
314
cdef class Background:
    """The null background model of HMMER.
315
316
317
318
319
320

    Attributes:
        alphabet (`~pyhmmer.easel.Alphabet`): The alphabet of the
            backgound model.
        uniform (`bool`): Whether or not the null model has been created
            with uniform frequencies.
321
322
        residue_frequencies (`~pyhmmer.easel.VectorF`): The null1 background
            residue frequencies.
323

324
    .. versionchanged:: 0.4.0
325
326
       Added the `residue_frequencies` attribute.

327
328
329
330
331
332
    """

    # --- Magic methods ------------------------------------------------------

    def __cinit__(self):
        self._bg = NULL
333
        self._L  = 350
334
        self.alphabet = None
335
        self.residue_frequencies = None
336
        self.uniform = False
337
338

    def __init__(self, Alphabet alphabet, bint uniform=False):
339
340
341
        """__init__(self, alphabet, uniform=False)\n--

        Create a new background model for the given ``alphabet``.
342
343
344
345
346
347
348
349

        Arguments:
          alphabet (`pyhmmer.easel.Alphabet`): The alphabet to create the
              background model with.
          uniform (`bool`): Whether or not to create the null model with
              uniform frequencies. Defaults to `False`.

        """
350
        # store the alphabet so it's not deallocated
351
        self.alphabet = alphabet
352
        # store whether or not the null model has uniform frequencies
353
        self.uniform = uniform
354
        # create the background profile
355
356
357
358
359
        with nogil:
            if uniform:
                self._bg = libhmmer.p7_bg.p7_bg_CreateUniform(alphabet._abc)
            else:
                self._bg = libhmmer.p7_bg.p7_bg_Create(alphabet._abc)
360
        if self._bg == NULL:
361
            raise AllocationError("P7_BG", sizeof(P7_BG))
362
        # expose the residue frequencies as the `residue_frequencies` attribute
363
364
365
366
367
        self.residue_frequencies = VectorF.__new__(VectorF)
        self.residue_frequencies._data = &(self._bg.f[0])
        self.residue_frequencies._owner = self
        self.residue_frequencies._n = self.residue_frequencies._shape[0] = self.alphabet.K

368
369
370
371
372
373
    def __dealloc__(self):
        libhmmer.p7_bg.p7_bg_Destroy(self._bg)

    def __copy__(self):
        return self.copy()

374
    # --- Properties ---------------------------------------------------------
375
376

    @property
377
    def L(self):
378
379
        """`int`: The mean of the null model length distribution, in residues.
        """
380
        assert self._bg != NULL
381
        return self._L
382

383
384
    @L.setter
    def L(self, int L):
385
386
        assert self._bg != NULL

387
388
389
390
        cdef int    status
        cdef P7_BG* bg     = self._bg

        with nogil:
391
            status = libhmmer.p7_bg.p7_bg_SetLength(bg, L)
392
393
        if status != libeasel.eslOK:
            raise UnexpectedError(status, "p7_bg_SetLength")
394
395
396
397
398
        self._L = L

    @property
    def transition_probability(self):
        r"""`float`: The null1 transition probability (:math:`\frac{L}{L+1}`).
399

400
        .. versionadded:: 0.4.0
401

402
403
404
405
406
407
        """
        assert self._bg != NULL
        return self._bg.p1

    @property
    def omega(self):
408
        """`float`: The *prior* on *null2*/*null3*.
409

410
        .. versionadded:: 0.4.0
411

412
413
414
415
416
417
418
419
        """
        assert self._bg != NULL
        return self._bg.omega

    @omega.setter
    def omega(self, float omega):
        assert self._bg != NULL
        self._bg.omega = omega
420
421
422
423

    # --- Methods ------------------------------------------------------------

    cpdef Background copy(self):
424
425
426
427
        """copy(self)\n--

        Create a copy of the null model with the same parameters.

428
        """
429
430
        assert self._bg != NULL

431
        cdef Background new = Background.__new__(Background)
432
433
        with nogil:
            new._bg = libhmmer.p7_bg.p7_bg_Clone(self._bg)
434
        if new._bg == NULL:
435
            raise AllocationError("P7_BG", sizeof(P7_BG))
436
437
438
439
440
441
442

        new.alphabet = self.alphabet
        new.uniform = self.uniform
        new.residue_frequencies = VectorF.__new__(VectorF)
        new.residue_frequencies._data = &(new._bg.f[0])
        new.residue_frequencies._owner = new
        new.residue_frequencies._n = new.residue_frequencies._shape[0] = new.alphabet.K
443
444
445
        return new


446
447
cdef class Builder:
    """A factory for constructing new HMMs from raw sequences.
448

449
450
451
    Attributes:
        alphabet (`~pyhmmer.easel.Alphabet`): The alphabet the builder is
            configured to use to convert sequences to HMMs.
452
453
        randomness (`~pyhmmer.easel.Randomness`): The random number generator
            being used by the builder.
454
455
456
457
458
459
        score_matrix (`str`): The name of the substitution matrix used to
            build HMMs for single sequences.
        popen (`float`): The *gap open* probability to use when building
            HMMs from single sequences.
        pextend (`float`): The *gap extend* probability to use when building
            HMMs from single sequences.
460

461
462
    .. versionadded:: 0.2.0

463
    .. versionchanged:: 0.4.2
464
       Added the `~Builder.randomness` attribute.
465

466
467
    """

468
469
470
    _ARCHITECTURE_STRATEGY = dict(BUILDER_ARCHITECTURE_STRATEGY)
    _WEIGHTING_STRATEGY = dict(BUILDER_WEIGHTING_STRATEGY)
    _EFFECTIVE_STRATEGY = dict(BUILDER_EFFECTIVE_STRATEGY)
471
472
473
474
475
476
477
478
479
480
481
482
483

    # --- Magic methods ------------------------------------------------------

    def __cinit__(self):
        self._bld = NULL
        self.alphabet = None

    def __init__(
        self,
        Alphabet alphabet,
        *,
        str architecture="fast",
        str weighting="pb",
484
        object effective_number="entropy",
485
486
487
        str prior_scheme="alphabet",
        float symfrac=0.5,
        float fragthresh=0.5,
488
489
490
        double wid=0.62,
        double esigma=45.0,
        double eid=0.62,
491
492
493
494
495
496
        int EmL=200,
        int EmN=200,
        int EvL=200,
        int EvN=200,
        int EfL=100,
        int EfN=200,
497
        double Eft=0.04,
498
        uint32_t seed=42,
499
        object ere=None,
500
501
        object popen=None,
        object pextend=None,
502
        object score_matrix=None,
503
504
        object window_length=None,
        object window_beta=None,
505
    ):
506
        """__init__(self, alphabet, *, architecture="fast", weighting="pb", effective_number="entropy", prior_scheme="alpha", symfrac=0.5, fragthresh=0.5, wid=0.62, esigma=45.0, eid=0.62, EmL=200, EmN=200, EvL=200, EvN=200, EfL=100, EfN=200, Eft=0.04, seed=42, ere=None, popen=None, pextend=None, window_length=None, window_beta=None)\n--
507
508
509
510
511
512
513

        Create a new sequence builder with the given configuration.

        Arguments:
            alphabet (`~pyhmmer.easel.Alphabet`): The alphabet the builder
                expects the sequences to be in.

514
        Keyword Arguments:
515
516
517
518
519
520
521
522
523
524
525
526
            architecture (`str`): The algorithm to use to determine the
                model architecture, either ``"fast"`` (the default), or
                ``"hand"``.
            weighting (`str`): The algorithm to use for relative sequence
                weighting, either ``"pb"`` (the default), ``"gsc"``,
                ``"blosum"``, ``"none"``, or ``"given"``.
            effective_number (`str`, `int`, or `float`): The algorithm to
                use to determine the effective sequence number, either
                ``"entropy"`` (the default), ``"exp"``, ``"clust"``, ``"none"``.
                A number can also be given directly to set the effective
                sequence number manually.
            prior_scheme (`str`): The choice of mixture Dirichlet prior when
527
528
                parameterizing  from counts, either ``"laplace"``
                or ``"alphabet"`` (the default).
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
            symfrac (`float`): The residue occurrence threshold for fast
                architecture determination.
            fragthresh (`float`): A threshold such that a sequence is called
                a fragment when :math:`L \\le fragthresh \times alen`.
            wid (`double`): The percent identity threshold for BLOSUM relative
                weighting.
            esigma (`float`): The minimum total relative entropy parameter
                for effective number entropy weights.
            eid (`float`): The percent identity threshold for effective
                number clustering.
            EmL (`int`): The length of sequences generated for MSV fitting.
            EmN (`int`): The number of sequences generated for MSV fitting.
            EvL (`int`): The lenght of sequences generated for Viterbi fitting.
            EvN (`int`): The number of sequences generated for Viterbi fitting.
            EfL (`int`): The lenght of sequences generated for Forward fitting.
            EfN (`int`): The number of sequences generated for Forward fitting.
            Eft (`float`): The tail mass used for Forward fitting.
            seed (`int`): The seed to use to initialize the internal random
                number generator. If ``0`` is given, an arbitrary seed will
                be chosen based on the current time.
            ere (`double`, optional): The relative entropy target for effective
                number weighting, or `None`.
551
552
553
            popen (`float`, optional): The *gap open* probability to use
                when building HMMs from single sequences. The default value
                depends on the alphabet: *0.02* for proteins,
554
                *0.03125* for nucleotides.
555
556
557
558
559
560
561
562
563
            pextend (`float`, optional): The *gap extend* probability to use
                when building HMMs from single sequences. Default depends on
                the alphabet: *0.4* for proteins, *0.75* for nucleotides.
            score_matrix (`str`, optional): The name of the score matrix to
                use when building HMMs from single sequences. The only
                allowed value for nucleotide alphabets is *DNA1*. For
                proteins, *PAM30*, *PAM70*, *PAM120*, *PAM240*, *BLOSUM45*,
                *BLOSUM50*, *BLOSUM62* (the default), *BLOSUM80* or
                *BLOSUM90* can be used.
564
565
            window_length (`float`, optional): The window length for
                nucleotide sequences, essentially the max expected hit length.
566
                *If given, takes precedence over* ``window_beta``.
567
568
            window_beta (`float`, optional): The tail mass at which window
                length is determined for nucleotide sequences.
569

570
        """
571
        # extract alphabet and create raw P7_BUILDER object
572
        self.alphabet = alphabet
573
        abcty = alphabet._abc.type
574
575
        with nogil:
            self._bld = libhmmer.p7_builder.p7_builder_Create(NULL, alphabet._abc)
576
        if self._bld == NULL:
577
            raise AllocationError("P7_BG", sizeof(P7_BG))
578

579
580
581
582
583
584
585
586
        # create a Randomness object exposing the internal RNG
        self.randomness = Randomness.__new__(Randomness)
        self.randomness._owner = self
        self.randomness._rng = self._bld.r

        # store the seed given at initialization and reseed the RNG
        self.seed = seed

587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
        # set numeric parameters
        self._bld.symfrac = symfrac
        self._bld.fragthresh = fragthresh
        self._bld.wid = wid
        self._bld.esigma = esigma
        self._bld.eid = eid
        self._bld.EmL = EmL
        self._bld.EmN = EmN
        self._bld.EvL = EvL
        self._bld.EvN = EvN
        self._bld.EfL = EfL
        self._bld.EfN = EfN
        self._bld.Eft = Eft

        # set the architecture strategy
602
        self.architecture = architecture
603
        _arch = BUILDER_ARCHITECTURE_STRATEGY.get(architecture)
604
605
606
607
608
609
        if _arch is not None:
            self._bld.arch_strategy = _arch
        else:
            raise ValueError(f"Invalid value for 'architecture': {architecture}")

        # set the weighting strategy
610
        self.weighting = weighting
611
        _weighting = BUILDER_WEIGHTING_STRATEGY.get(weighting)
612
613
614
615
616
617
        if _weighting is not None:
            self._bld.wgt_strategy = _weighting
        else:
            raise ValueError(f"Invalid value for 'weighting': {weighting}")

        # set the effective sequence number strategy
618
        self.effective_number = effective_number
619
620
621
        if isinstance(effective_number, (int, float)):
            self._bld.effn_strategy = p7_effnchoice_e.p7_EFFN_SET
            self._bld.eset = effective_number
622
        elif isinstance(effective_number, str):
623
            _effn = BUILDER_EFFECTIVE_STRATEGY.get(effective_number)
624
625
626
627
            if _effn is not None:
                self._bld.effn_strategy = _effn
            else:
                raise ValueError(f"Invalid value for 'effective_number': {effective_number}")
628
629
630
        else:
            ty = type(effective_number).__name__
            raise TypeError(f"Invalid type for 'effective_number': {ty}")
631

632
633
634
635
636
637
638
639
640
641
        # set the RE target if given one, otherwise the default
        # is alphabet dependent
        if ere is not None:
            self._bld.re_target = ere
        elif alphabet.is_nucleotide():
            self._bld.re_target = libhmmer.p7_ETARGET_DNA
        elif alphabet.is_amino():
            self._bld.re_target = libhmmer.p7_ETARGET_AMINO
        else:
            self._bld.re_target = libhmmer.p7_ETARGET_OTHER
642
643

        # set the prior scheme
644
        self.prior_scheme = prior_scheme
645
646
647
648
649
650
651
652
653
        if prior_scheme is None:
            self._bld.prior = NULL
        elif prior_scheme == "laplace":
            self._bld.prior = libhmmer.p7_prior.p7_prior_CreateLaplace(self.alphabet._abc)
        elif prior_scheme == "alphabet":
            if alphabet.is_amino():
                self._bld.prior = libhmmer.p7_prior.p7_prior_CreateAmino()
            elif alphabet.is_nucleotide():
                self._bld.prior = libhmmer.p7_prior.p7_prior_CreateNucleic()
654
            else:
655
656
657
                self._bld.prior = libhmmer.p7_prior.p7_prior_CreateLaplace(self.alphabet._abc)
        else:
            raise ValueError("Invalid value for 'prior_scheme': {prior_scheme}")
658

659
        # set the gap probabilities and score matrix using alphabet-specific
660
        # defaults (this is only used when building a HMM for a single sequence)
661
        if alphabet.is_nucleotide():
662
            self.score_matrix = "DNA1" if score_matrix is None else score_matrix
663
664
665
            self.popen = 0.03125 if popen is None else popen
            self.pextend = 0.75 if pextend is None else pextend
        else:
666
            self.score_matrix = "BLOSUM62" if score_matrix is None else score_matrix
667
668
669
            self.popen = 0.02 if popen is None else popen
            self.pextend = 0.4 if pextend is None else pextend

670
671
672
673
674
675
        # set the window options
        self.window_length = window_length
        if window_beta is None:
            self.window_beta = libhmmer.p7_builder.p7_DEFAULT_WINDOW_BETA
        else:
            self.window_beta = window_beta
676

677
678
679
    def __dealloc__(self):
        libhmmer.p7_builder.p7_builder_Destroy(self._bld)

680
681
682
    def __copy__(self):
        return self.copy()

683
684
685
686
    # --- Properties ---------------------------------------------------------

    @property
    def seed(self):
687
688
689
690
691
        """`int`: The seed given at builder initialization.

        Setting this attribute to a different value will cause the internal
        random number generator to be reseeded immediately.

692
        .. versionchanged:: 0.4.2
693
           Avoid shadowing initial null seeds given on builder initialization.
694

695
        """
696
        return self._seed
697
698
699

    @seed.setter
    def seed(self, uint32_t seed):
700
        self._seed = seed
701
        self._bld.do_reseeding = seed != 0
702
        self.randomness._seed(seed)
703

704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
    @property
    def window_length(self):
        """`int` or `None`: The window length for nucleotide sequences.
        """
        assert self._bld != NULL
        return None if self._bld.w_len == -1 else self._bld.w_len

    @window_length.setter
    def window_length(self, object window_length):
        assert self._bld != NULL
        if window_length is None:
            self._bld.w_len = -1
        elif window_length > 3:
            self._bld.w_len = window_length
        else:
            raise ValueError(f"Invalid window length: {window_length!r}")

    @property
    def window_beta(self):
        """`float`: The tail mass at which window length is determined.
        """
        assert self._bld != NULL
        return self._bld.w_beta

    @window_beta.setter
    def window_beta(self, double window_beta):
        assert self._bld != NULL
        if window_beta > 1 or window_beta < 0:
            raise ValueError(f"Invalid window tail mass: {window_beta!r}")
        self._bld.w_beta = window_beta

735
736
    # --- Methods ------------------------------------------------------------

737
738
739
740
741
    cpdef tuple build(
        self,
        DigitalSequence sequence,
        Background background,
    ):
742
        """build(self, sequence, background)\n--
743
744

        Build a new HMM from ``sequence`` using the builder configuration.
745
746
747
748

        Arguments:
            sequence (`~pyhmmer.easel.DigitalSequence`): A single biological
                sequence in digital mode to build a HMM with.
749
750
            background (`pyhmmer.plan7.background`): The background model
                to use to create the HMM.
751

752
753
754
755
        Returns:
            (`HMM`, `Profile`, `OptimizedProfile`): A tuple containing the
            new HMM as well as profiles to be used directly in a `Pipeline`.

756
        Raises:
757
758
            `~pyhmmer.errors.AlphabetMismatch`: When either ``sequence`` or
                ``background`` have the wrong alphabet for this builder.
759
760
            `ValueError`: When the HMM cannot be created successfully
                from the input; the error message contains more details.
761

762
763
764
765
766
767
768
769
770
771
772
        Hint:
            The score matrix and the gap probabilities used here can be set
            when initializing the builder, or changed by setting a new value
            to the right attribute::

                >>> alphabet = easel.Alphabet.amino()
                >>> background = plan7.Background(alphabet)
                >>> builder = plan7.Builder(alphabet, popen=0.05)
                >>> builder.score_matrix = "BLOSUM90"
                >>> hmm, _, _ = builder.build(proteins[0], background)

773
774
775
        .. versionchanged:: 0.4.6
           Sets the `HMM.creation_time` attribute with the current time.

776
777
778
        .. versionchanged:: 0.4.10
           The score system is now cached between calls to `Builder.build`.

779
        """
780
781
        assert self._bld != NULL

782
783
784
        cdef int              status
        cdef HMM              hmm     = HMM.__new__(HMM)
        cdef Profile          profile = Profile.__new__(Profile)
785
        cdef OptimizedProfile opti    = OptimizedProfile.__new__(OptimizedProfile)
786
        cdef bytes            mx      = self.score_matrix.encode('utf-8')
787
        cdef char*            mx_ptr  = <char*> mx
788
        cdef str              msg
789

790
791
        # reseed RNG used by the builder if needed
        if self._bld.do_reseeding:
792
            self.randomness._seed(self.seed)
793

794
        # check alphabet and assign it to newly created objects
795
        hmm.alphabet = profile.alphabet = opti.alphabet = self.alphabet
796
        if not self.alphabet._eq(background.alphabet):
797
            raise AlphabetMismatch(self.alphabet, background.alphabet)
798
        if not self.alphabet._eq(sequence.alphabet):
799
            raise AlphabetMismatch(self.alphabet, sequence.alphabet)
800

801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
        # only load the score system if it hasn't been loaded already,
        # or if a different score system is in use
        if self._bld.S == NULL or strcmp(self._bld.S.name, mx_ptr) != 0:
            with nogil:
                status = libhmmer.p7_builder.p7_builder_LoadScoreSystem(
                    self._bld,
                    mx_ptr,
                    self.popen,
                    self.pextend,
                    background._bg
                )
            if status == libeasel.eslEINVAL:
                msg = self._bld.errbuf.decode('utf-8', 'ignore')
                raise RuntimeError(f"failed to set single sequence score system: {msg}")
            elif status != libeasel.eslOK:
                raise UnexpectedError(status, "p7_builder_LoadScoreSystem")
        else:
            self._bld.popen = self.popen
            self._bld.pextend = self.pextend
820
821
822

        # build HMM and profiles
        with nogil:
823
824
825
826
827
828
829
830
831
            status = libhmmer.p7_builder.p7_SingleBuilder(
                self._bld,
                sequence._sq,
                background._bg,
                &hmm._hmm, # HMM
                NULL, # traceback
                &profile._gm, # profile,
                &opti._om, # optimized profile
            )
832
        if status == libeasel.eslOK:
833
            hmm.command_line = " ".join(sys.argv)
834
        elif status == libeasel.eslEINVAL:
835
836
            msg = self._bld.errbuf.decode("utf-8", "replace")
            raise ValueError("Could not build HMM: {}".format(msg))
837
838
839
        else:
            raise UnexpectedError(status, "p7_SingleBuilder")

840
841
842
        # return newly built HMM, profile and optimized profile
        return hmm, profile, opti

843
844
845
846
847
    cpdef tuple build_msa(
        self,
        DigitalMSA msa,
        Background background,
    ):
848
849
850
851
852
853
854
855
856
857
        """build_msa(self, msa, background)\n--

        Build a new HMM from ``msa`` using the builder configuration.

        Arguments:
            msa (`~pyhmmer.easel.DigitalMSA`): A multiple sequence
                alignment in digital mode to build a HMM with.
            background (`pyhmmer.plan7.background`): The background model
                to use to create the HMM.

858
859
860
861
        Returns:
            (`HMM`, `Profile`, `OptimizedProfile`): A tuple containing the
            new HMM as well as profiles to be used directly in a `Pipeline`.

862
        Raises:
863
864
            `~pyhmmer.errors.AlphabetMismatch`: When either ``msa`` or
                ``background`` have the wrong alphabet for this builder.
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
            `ValueError`: When the HMM cannot be created successfully
                from the input; the error message contains more details.

        Caution:
            HMMER requires that every HMM has a name, so the `Builder` will
            attempt to use the name of the sequence to name the HMM. Passing
            an MSA without a name will result in an error::

                >>> alphabet = easel.Alphabet.amino()
                >>> msa = easel.TextMSA(sequences=[
                ...   easel.TextSequence(name=b"ustiA", sequence="YAIG"),
                ...   easel.TextSequence(name=b"ustiB", sequence="YVIG")
                ... ])
                >>> builder = plan7.Builder(alphabet)
                >>> background = plan7.Background(alphabet)
                >>> builder.build_msa(msa.digitize(alphabet), background)
                Traceback (most recent call last):
                  ...
                ValueError: Could not build HMM: Unable to name the HMM.
884

885
886
        .. versionadded:: 0.3.0

887
888
889
        .. versionchanged:: 0.4.6
           Sets the `HMM.creation_time` attribute with the current time.

890
891
892
        """
        assert self._bld != NULL

893
894
895
896
        cdef int              status
        cdef HMM              hmm     = HMM.__new__(HMM)
        cdef Profile          profile = Profile.__new__(Profile)
        cdef OptimizedProfile opti    = OptimizedProfile.__new__(OptimizedProfile)
897
        cdef str              msg
898
        cdef size_t           k
899

900
901
        # unset builder probabilities and score system if they were set
        # by a call to `build`, since we won't be needing them here
902
        self._bld.popen = self._bld.pextend = -1
903
904
905
906
907
908
        if self._bld.S != NULL:
            libeasel.scorematrix.esl_scorematrix_Destroy(self._bld.S)
            self._bld.S = NULL
        if self._bld.Q != NULL:
            libeasel.dmatrix.esl_dmatrix_Destroy(self._bld.Q)
            self._bld.Q = NULL
909
910
911

        # reseed RNG used by the builder if needed
        if self._bld.do_reseeding:
912
            self.randomness._seed(self.seed)
913
914
915

        # check alphabet and assign it to newly created objects
        hmm.alphabet = profile.alphabet = opti.alphabet = self.alphabet
916
        if not self.alphabet._eq(background.alphabet):
917
            raise AlphabetMismatch(self.alphabet, background.alphabet)
918
        if not self.alphabet._eq(msa.alphabet):
919
            raise AlphabetMismatch(self.alphabet, msa.alphabet)
920

921
        # build HMM
922
923
924
925
926
927
928
929
930
931
932
933
934
        with nogil:
            # build HMM and profiles
            status = libhmmer.p7_builder.p7_Builder(
                self._bld,
                msa._msa,
                background._bg,
                &hmm._hmm, # HMM
                NULL, # traceback
                &profile._gm, # profile,
                &opti._om, # optimized profile
                NULL # modified msa
            )
        if status == libeasel.eslOK:
935
            hmm.command_line = " ".join(sys.argv)
936
        elif status == libeasel.eslEINVAL:
937
938
            msg = self._bld.errbuf.decode("utf-8", "replace")
            raise ValueError("Could not build HMM: {}".format(msg))
939
940
        else:
            raise UnexpectedError(status, "p7_Builder")
941

942
943
944
        # return newly built HMM, profile and optimized profile
        return hmm, profile, opti

945
    cpdef Builder copy(self):
946
947
948
949
        """copy(self)\n--

        Create a duplicate `Builder` instance with the same arguments.

950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
        """
        assert self._bld != NULL
        return Builder(
            self.alphabet,
            architecture=self.architecture,
            weighting=self.weighting,
            effective_number=self.effective_number,
            prior_scheme=self.prior_scheme,
            symfrac=self._bld.symfrac,
            fragthresh=self._bld.fragthresh,
            wid=self._bld.wid,
            esigma=self._bld.esigma,
            eid=self._bld.eid,
            EmL=self._bld.EmL,
            EmN=self._bld.EmN,
            EvL=self._bld.EvL,
            EvN=self._bld.EvN,
967
968
            EfL=self._bld.EfL,
            EfN=self._bld.EfN,
969
            Eft=self._bld.Eft,
970
            seed=self.seed,
971
            ere=self._bld.re_target,
972
            popen=self.popen,
973
974
975
            pextend=self.pextend,
            window_length=self.window_length,
            window_beta=self.window_beta,
976
977
        )

978

979
@cython.freelist(8)
980
@cython.no_gc_clear
981
cdef class Cutoffs:
982
    """A mutable view over the score cutoffs of a `HMM` or a `Profile`.
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000

    .. versionadded:: 0.4.6

    """

    # --- Magic methods ------------------------------------------------------

    def __cinit__(self):
        self._owner = None
        self._flags = NULL
        self._cutoffs = NULL
        self._is_profile = True

    def __init__(self, object owner):
        cdef str ty
        if isinstance(owner, Profile):
            self._cutoffs = &(<Profile> owner)._gm.cutoff
            self._flags = NULL
For faster browsing, not all history is shown. View entire blame