plan7.pyx 255 KB
Newer Older
1
2
# coding: utf-8
# cython: language_level=3, linetrace=True
3
4
5
6
7
8
9
10
11
"""High-level interface to the Plan7 data model.

Plan7 is the model architecture used by HMMER since HMMER2.

See Also:
    Details about the Plan 7 architecture in the `HMMER documentation
    <http://www.csb.yale.edu/userguides/seq/hmmer/docs/node11.html>`_.

"""
12

13
14
# --- C imports --------------------------------------------------------------

15
cimport cython
16
from cpython.bytes cimport PyBytes_FromStringAndSize
17
from cpython.list cimport PyList_New, PyList_SET_ITEM
18
from cpython.ref cimport PyObject
19
from cpython.exc cimport PyErr_Clear
20
21
from cpython.unicode cimport PyUnicode_DecodeASCII
from libc.math cimport exp, ceil
22
from libc.stddef cimport ptrdiff_t
23
from libc.stdio cimport printf
24
from libc.stdlib cimport calloc, malloc, realloc, free
25
from libc.stdint cimport uint8_t, uint32_t, int64_t
26
from libc.stdio cimport fprintf, FILE, stdout, fclose
27
from libc.string cimport memset, memcpy, memmove, strdup, strndup, strlen, strcmp, strncpy
28
29
from libc.time cimport ctime, strftime, time, time_t, tm, localtime_r
from unicode cimport PyUnicode_DATA, PyUnicode_KIND, PyUnicode_READ, PyUnicode_READY, PyUnicode_GET_LENGTH
30

31
cimport libeasel
32
cimport libeasel.sq
33
cimport libeasel.alphabet
34
cimport libeasel.dmatrix
35
cimport libeasel.fileparser
36
cimport libeasel.random
37
cimport libeasel.scorematrix
38
cimport libeasel.getopts
39
cimport libeasel.vec
40
cimport libhmmer
41
cimport libhmmer.modelconfig
42
cimport libhmmer.modelstats
43
cimport libhmmer.p7_alidisplay
44
cimport libhmmer.p7_hmm
45
cimport libhmmer.p7_builder
46
cimport libhmmer.p7_bg
47
cimport libhmmer.p7_domaindef
48
cimport libhmmer.p7_hit
49
50
cimport libhmmer.p7_hmmfile
cimport libhmmer.p7_pipeline
51
cimport libhmmer.p7_prior
52
cimport libhmmer.p7_profile
53
cimport libhmmer.p7_scoredata
54
cimport libhmmer.p7_tophits
55
56
cimport libhmmer.p7_trace
cimport libhmmer.tracealign
57
from libeasel cimport eslERRBUFSIZE, eslCONST_LOG2R
58
from libeasel.alphabet cimport ESL_ALPHABET, esl_alphabet_Create, esl_abc_ValidateType
59
60
from libeasel.getopts cimport ESL_GETOPTS, ESL_OPTIONS
from libeasel.sq cimport ESL_SQ
61
from libeasel.keyhash cimport ESL_KEYHASH
62
from libeasel.fileparser cimport ESL_FILEPARSER
63
from libhmmer cimport p7_LOCAL, p7_EVPARAM_UNSET, p7_CUTOFF_UNSET, p7_NEVPARAM, p7_NCUTOFFS, p7_offsets_e, p7_cutoffs_e, p7_evparams_e
64
from libhmmer.logsum cimport p7_FLogsumInit
65
from libhmmer.p7_builder cimport P7_BUILDER, p7_archchoice_e, p7_wgtchoice_e, p7_effnchoice_e
66
from libhmmer.p7_hmm cimport p7H_NTRANSITIONS, p7H_TC, p7H_GA, p7H_NC, p7H_MAP
67
from libhmmer.p7_hmmfile cimport p7_hmmfile_formats_e
68
from libhmmer.p7_hit cimport p7_hitflags_e, P7_HIT
69
from libhmmer.p7_alidisplay cimport P7_ALIDISPLAY
70
from libhmmer.p7_pipeline cimport P7_PIPELINE, p7_pipemodes_e, p7_zsetby_e, p7_strands_e, p7_complementarity_e
71
from libhmmer.p7_profile cimport p7_LOCAL, p7_GLOCAL, p7_UNILOCAL, p7_UNIGLOCAL
72
from libhmmer.p7_trace cimport P7_TRACE, p7t_statetype_e
73

74
IF HMMER_IMPL == "VMX":
75
    from libhmmer.impl_vmx cimport p7_oprofile, p7_omx, impl_Init
76
    from libhmmer.impl_vmx.io cimport p7_oprofile_Write, p7_oprofile_ReadMSV, p7_oprofile_ReadRest
77
78
79
80
81
82
83
    from libhmmer.impl_vmx.p7_oprofile cimport (
        P7_OPROFILE,
        p7O_NQB,
        p7_oprofile_Compare,
        p7_oprofile_Dump,
        p7_oprofile_Sizeof,
    )
84
ELIF HMMER_IMPL == "SSE":
85
    from libhmmer.impl_sse cimport p7_oprofile, p7_omx, impl_Init, p7_SSVFilter, p7O_EXTRA_SB
86
    from libhmmer.impl_sse.io cimport p7_oprofile_Write, p7_oprofile_ReadMSV, p7_oprofile_ReadRest
87
88
89
90
91
92
93
    from libhmmer.impl_sse.p7_oprofile cimport (
        P7_OPROFILE,
        p7O_NQB,
        p7_oprofile_Compare,
        p7_oprofile_Dump,
        p7_oprofile_Sizeof,
    )
94

95
96
97
98
from .easel cimport (
    Alphabet,
    Sequence,
    DigitalSequence,
99
    KeyHash,
100
101
102
103
    MSA,
    TextMSA,
    DigitalMSA,
    VectorF,
104
    VectorU8,
105
106
    MatrixF,
    MatrixU8,
107
)
108
from .reexports.p7_tophits cimport p7_tophits_Reuse
109
110
111
112
113
114
115
116
117
118
119
from .reexports.p7_hmmfile cimport (
    read_asc20hmm,
    read_asc30hmm,
    read_bin30hmm,
    v3a_magic,
    v3b_magic,
    v3c_magic,
    v3d_magic,
    v3e_magic,
    v3f_magic
)
120

121
122
123
124
125
IF UNAME_SYSNAME == "Linux":
    include "fileobj/linux.pxi"
ELIF UNAME_SYSNAME == "Darwin" or UNAME_SYSNAME.endswith("BSD"):
    include "fileobj/bsd.pxi"

126
127
# --- Python imports ---------------------------------------------------------

128
import array
129
import collections.abc
130
import datetime
131
import errno
132
import math
133
import io
134
import itertools
135
136
import os
import sys
137
138
import warnings

139
from .errors import AllocationError, UnexpectedError, AlphabetMismatch
140
from .utils import peekable
141
142


143
144
# --- Constants --------------------------------------------------------------

145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
cdef dict BUILDER_ARCHITECTURE_STRATEGY = {
    "fast": p7_archchoice_e.p7_ARCH_FAST,
    "hand": p7_archchoice_e.p7_ARCH_HAND,
}

cdef dict BUILDER_WEIGHTING_STRATEGY = {
    "pb": p7_wgtchoice_e.p7_WGT_PB,
    "gsc": p7_wgtchoice_e.p7_WGT_GSC,
    "blosum": p7_wgtchoice_e.p7_WGT_BLOSUM,
    "none": p7_wgtchoice_e.p7_WGT_NONE,
    "given": p7_wgtchoice_e.p7_WGT_GIVEN,
}

cdef dict BUILDER_EFFECTIVE_STRATEGY = {
    "entropy": p7_effnchoice_e.p7_EFFN_ENTROPY,
    "exp": p7_effnchoice_e.p7_EFFN_ENTROPY_EXP,
    "clust": p7_effnchoice_e.p7_EFFN_CLUST,
    "none": p7_effnchoice_e.p7_EFFN_NONE,
}

165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
cdef dict HMM_FILE_FORMATS = {
    "2.0": p7_hmmfile_formats_e.p7_HMMFILE_20,
    "3/a": p7_hmmfile_formats_e.p7_HMMFILE_3a,
    "3/b": p7_hmmfile_formats_e.p7_HMMFILE_3b,
    "3/c": p7_hmmfile_formats_e.p7_HMMFILE_3c,
    "3/d": p7_hmmfile_formats_e.p7_HMMFILE_3d,
    "3/e": p7_hmmfile_formats_e.p7_HMMFILE_3e,
    "3/f": p7_hmmfile_formats_e.p7_HMMFILE_3f,
}

cdef dict HMM_FILE_MAGIC = {
    v3a_magic: p7_hmmfile_formats_e.p7_HMMFILE_3a,
    v3b_magic: p7_hmmfile_formats_e.p7_HMMFILE_3b,
    v3c_magic: p7_hmmfile_formats_e.p7_HMMFILE_3c,
    v3d_magic: p7_hmmfile_formats_e.p7_HMMFILE_3d,
    v3e_magic: p7_hmmfile_formats_e.p7_HMMFILE_3e,
    v3f_magic: p7_hmmfile_formats_e.p7_HMMFILE_3f,
}

184
185
186
187
188
cdef dict PIPELINE_BIT_CUTOFFS = {
    "gathering": libhmmer.p7_hmm.p7H_GA,
    "noise": libhmmer.p7_hmm.p7H_NC,
    "trusted": libhmmer.p7_hmm.p7H_TC,
}
189

190
# --- Cython classes ---------------------------------------------------------
191

192
193

cdef class Alignment:
194
195
196
197
198
    """An alignment of a sequence to a profile.

    Attributes:
        domain (`Domain`): The domain this alignment corresponds to.

199
    """
200
201
202
203
204
205
206

    # --- Magic methods ------------------------------------------------------

    def __cinit__(self, Domain domain):
        self.domain = domain
        self._ad = domain._dom.ad

207
208
209
    def __len__(self):
        return self.hmm_to - self.hmm_from

210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
    def __str__(self):
        assert self._ad != NULL

        cdef int    status
        cdef object buffer = io.BytesIO()
        cdef FILE*  fp     = fopen_obj(buffer, "w")

        try:
            status = libhmmer.p7_alidisplay.p7_nontranslated_alidisplay_Print(
                fp,
                self._ad,
                0,
                -1,
                False,
            )
            if status == libeasel.eslEWRITE:
                raise OSError("Failed to write alignment")
            elif status != libeasel.eslOK:
                raise UnexpectedError(status, "p7_alidisplay_Print")
        finally:
            fclose(fp)

        return buffer.getvalue().decode("ascii")

234
235
236
237
238
239
    cpdef VectorU8 __getstate__(self):
        cdef int       status
        cdef uint32_t  n      = 0
        cdef uint32_t  nalloc = 0
        cdef uint8_t*  buffer = NULL
        cdef VectorU8  vec
240

241
242
243
244
        with nogil:
            status = libhmmer.p7_alidisplay.p7_alidisplay_Serialize(self._ad, &buffer, &n, &nalloc)
        if status != libeasel.eslOK:
            raise UnexpectedError(status, "p7_alidisplay_Serialize")
245

246
247
248
249
250
        vec = VectorU8.__new__(VectorU8)
        vec._owner = None
        vec._n = vec._shape[0] = nalloc
        vec._data = <void*> buffer
        return vec
251

252
253
254
    cpdef object __setstate__(self, uint8_t[::1] state):
        cdef int      status
        cdef uint32_t offset = 0
255

256
257
258
259
        if self._ad == NULL:
            self.domain._dom.ad = self._ad = libhmmer.p7_alidisplay.p7_alidisplay_Create_empty()
            if self._ad == NULL:
                raise AllocationError("P7_ALIDISPLAY", sizeof(P7_ALIDISPLAY))
260

261
262
263
264
        with nogil:
            status = libhmmer.p7_alidisplay.p7_alidisplay_Deserialize(&state[0], &offset, self._ad)
        if status != libeasel.eslOK:
            raise UnexpectedError(status, "p7_alidisplay_Deserialize")
265

266
267
268
269
    # --- Properties ---------------------------------------------------------

    @property
    def hmm_from(self):
270
271
        """`int`: The start coordinate of the alignment in the query HMM.
        """
272
        assert self._ad != NULL
273
274
275
276
        return self._ad.hmmfrom

    @property
    def hmm_to(self):
277
278
        """`int`: The end coordinate of the alignment in the query HMM.
        """
279
        assert self._ad != NULL
280
281
282
283
        return self._ad.hmmto

    @property
    def hmm_name(self):
284
285
        """`bytes`: The name of the query HMM.
        """
286
287
        assert self._ad != NULL
        assert self._ad.hmmname != NULL
288
289
        return <bytes> self._ad.hmmname

290
291
    @property
    def hmm_accession(self):
292
        """`bytes`: The accession of the query, or its name if it has none.
293
294
295

        .. versionadded:: 0.1.4

296
        """
297
298
        assert self._ad != NULL
        assert self._ad.hmmacc != NULL
299
300
        return <bytes> self._ad.hmmacc

301
302
    @property
    def hmm_sequence(self):
303
304
        """`str`: The sequence of the query HMM in the alignment.
        """
305
        assert self._ad != NULL
306
307
308
309
        return self._ad.model.decode('ascii')

    @property
    def target_from(self):
310
311
        """`int`: The start coordinate of the alignment in the target sequence.
        """
312
        assert self._ad != NULL
313
314
315
316
        return self._ad.sqfrom

    @property
    def target_name(self):
317
318
        """`bytes`: The name of the target sequence.
        """
319
320
        assert self._ad != NULL
        assert self._ad.sqname != NULL
321
322
323
324
        return <bytes> self._ad.sqname

    @property
    def target_sequence(self):
325
326
        """`str`: The sequence of the target sequence in the alignment.
        """
327
        assert self._ad != NULL
328
329
330
331
        return self._ad.aseq.decode('ascii')

    @property
    def target_to(self):
332
333
        """`int`: The end coordinate of the alignment in the target sequence.
        """
334
        assert self._ad != NULL
335
336
337
338
        return self._ad.sqto

    @property
    def identity_sequence(self):
339
340
        """`str`: The identity sequence between the query and the target.
        """
341
        assert self._ad != NULL
342
343
344
        return self._ad.mline.decode('ascii')


345
346
cdef class Background:
    """The null background model of HMMER.
347
348
349
350
351
352

    Attributes:
        alphabet (`~pyhmmer.easel.Alphabet`): The alphabet of the
            backgound model.
        uniform (`bool`): Whether or not the null model has been created
            with uniform frequencies.
353
354
        residue_frequencies (`~pyhmmer.easel.VectorF`): The null1 background
            residue frequencies.
355

356
    .. versionchanged:: 0.4.0
357
358
       Added the `residue_frequencies` attribute.

359
360
361
362
363
364
    """

    # --- Magic methods ------------------------------------------------------

    def __cinit__(self):
        self._bg = NULL
365
        self._L  = 350
366
        self.alphabet = None
367
        self.residue_frequencies = None
368
        self.uniform = False
369
370

    def __init__(self, Alphabet alphabet, bint uniform=False):
371
372
373
        """__init__(self, alphabet, uniform=False)\n--

        Create a new background model for the given ``alphabet``.
374
375
376
377
378
379
380
381

        Arguments:
          alphabet (`pyhmmer.easel.Alphabet`): The alphabet to create the
              background model with.
          uniform (`bool`): Whether or not to create the null model with
              uniform frequencies. Defaults to `False`.

        """
382
        # store the alphabet so it's not deallocated
383
        self.alphabet = alphabet
384
        # store whether or not the null model has uniform frequencies
385
        self.uniform = uniform
386
        # create the background profile
387
388
389
390
391
        with nogil:
            if uniform:
                self._bg = libhmmer.p7_bg.p7_bg_CreateUniform(alphabet._abc)
            else:
                self._bg = libhmmer.p7_bg.p7_bg_Create(alphabet._abc)
392
        if self._bg == NULL:
393
            raise AllocationError("P7_BG", sizeof(P7_BG))
394
        # expose the residue frequencies as the `residue_frequencies` attribute
395
396
397
398
399
        self.residue_frequencies = VectorF.__new__(VectorF)
        self.residue_frequencies._data = &(self._bg.f[0])
        self.residue_frequencies._owner = self
        self.residue_frequencies._n = self.residue_frequencies._shape[0] = self.alphabet.K

400
401
402
403
404
405
    def __dealloc__(self):
        libhmmer.p7_bg.p7_bg_Destroy(self._bg)

    def __copy__(self):
        return self.copy()

406
    # --- Properties ---------------------------------------------------------
407
408

    @property
409
    def L(self):
410
411
        """`int`: The mean of the null model length distribution, in residues.
        """
412
        assert self._bg != NULL
413
        return self._L
414

415
416
    @L.setter
    def L(self, int L):
417
418
        assert self._bg != NULL

419
420
421
422
        cdef int    status
        cdef P7_BG* bg     = self._bg

        with nogil:
423
            status = libhmmer.p7_bg.p7_bg_SetLength(bg, L)
424
425
        if status != libeasel.eslOK:
            raise UnexpectedError(status, "p7_bg_SetLength")
426
427
428
429
430
        self._L = L

    @property
    def transition_probability(self):
        r"""`float`: The null1 transition probability (:math:`\frac{L}{L+1}`).
431

432
        .. versionadded:: 0.4.0
433

434
435
436
437
438
439
        """
        assert self._bg != NULL
        return self._bg.p1

    @property
    def omega(self):
440
        """`float`: The *prior* on *null2*/*null3*.
441

442
        .. versionadded:: 0.4.0
443

444
445
446
447
448
449
450
451
        """
        assert self._bg != NULL
        return self._bg.omega

    @omega.setter
    def omega(self, float omega):
        assert self._bg != NULL
        self._bg.omega = omega
452
453
454
455

    # --- Methods ------------------------------------------------------------

    cpdef Background copy(self):
456
457
458
459
        """copy(self)\n--

        Create a copy of the null model with the same parameters.

460
        """
461
462
        assert self._bg != NULL

463
        cdef Background new = Background.__new__(Background)
464
465
        with nogil:
            new._bg = libhmmer.p7_bg.p7_bg_Clone(self._bg)
466
        if new._bg == NULL:
467
            raise AllocationError("P7_BG", sizeof(P7_BG))
468
469
470
471
472
473
474

        new.alphabet = self.alphabet
        new.uniform = self.uniform
        new.residue_frequencies = VectorF.__new__(VectorF)
        new.residue_frequencies._data = &(new._bg.f[0])
        new.residue_frequencies._owner = new
        new.residue_frequencies._n = new.residue_frequencies._shape[0] = new.alphabet.K
475
476
477
        return new


478
479
cdef class Builder:
    """A factory for constructing new HMMs from raw sequences.
480

481
482
483
    Attributes:
        alphabet (`~pyhmmer.easel.Alphabet`): The alphabet the builder is
            configured to use to convert sequences to HMMs.
484
485
        randomness (`~pyhmmer.easel.Randomness`): The random number generator
            being used by the builder.
486
487
488
489
490
491
        score_matrix (`str`): The name of the substitution matrix used to
            build HMMs for single sequences.
        popen (`float`): The *gap open* probability to use when building
            HMMs from single sequences.
        pextend (`float`): The *gap extend* probability to use when building
            HMMs from single sequences.
492

493
494
    .. versionadded:: 0.2.0

495
    .. versionchanged:: 0.4.2
496
       Added the `~Builder.randomness` attribute.
497

498
499
    """

500
501
502
    _ARCHITECTURE_STRATEGY = dict(BUILDER_ARCHITECTURE_STRATEGY)
    _WEIGHTING_STRATEGY = dict(BUILDER_WEIGHTING_STRATEGY)
    _EFFECTIVE_STRATEGY = dict(BUILDER_EFFECTIVE_STRATEGY)
503
504
505
506
507
508
509
510
511
512
513
514
515

    # --- Magic methods ------------------------------------------------------

    def __cinit__(self):
        self._bld = NULL
        self.alphabet = None

    def __init__(
        self,
        Alphabet alphabet,
        *,
        str architecture="fast",
        str weighting="pb",
516
        object effective_number="entropy",
517
518
519
        str prior_scheme="alphabet",
        float symfrac=0.5,
        float fragthresh=0.5,
520
521
522
        double wid=0.62,
        double esigma=45.0,
        double eid=0.62,
523
524
525
526
527
528
        int EmL=200,
        int EmN=200,
        int EvL=200,
        int EvN=200,
        int EfL=100,
        int EfN=200,
529
        double Eft=0.04,
530
        uint32_t seed=42,
531
        object ere=None,
532
533
        object popen=None,
        object pextend=None,
534
        object score_matrix=None,
535
536
        object window_length=None,
        object window_beta=None,
537
    ):
538
        """__init__(self, alphabet, *, architecture="fast", weighting="pb", effective_number="entropy", prior_scheme="alpha", symfrac=0.5, fragthresh=0.5, wid=0.62, esigma=45.0, eid=0.62, EmL=200, EmN=200, EvL=200, EvN=200, EfL=100, EfN=200, Eft=0.04, seed=42, ere=None, popen=None, pextend=None, window_length=None, window_beta=None)\n--
539
540
541
542
543
544
545

        Create a new sequence builder with the given configuration.

        Arguments:
            alphabet (`~pyhmmer.easel.Alphabet`): The alphabet the builder
                expects the sequences to be in.

546
        Keyword Arguments:
547
548
549
550
551
552
553
554
555
556
557
558
            architecture (`str`): The algorithm to use to determine the
                model architecture, either ``"fast"`` (the default), or
                ``"hand"``.
            weighting (`str`): The algorithm to use for relative sequence
                weighting, either ``"pb"`` (the default), ``"gsc"``,
                ``"blosum"``, ``"none"``, or ``"given"``.
            effective_number (`str`, `int`, or `float`): The algorithm to
                use to determine the effective sequence number, either
                ``"entropy"`` (the default), ``"exp"``, ``"clust"``, ``"none"``.
                A number can also be given directly to set the effective
                sequence number manually.
            prior_scheme (`str`): The choice of mixture Dirichlet prior when
559
560
                parameterizing  from counts, either ``"laplace"``
                or ``"alphabet"`` (the default).
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
            symfrac (`float`): The residue occurrence threshold for fast
                architecture determination.
            fragthresh (`float`): A threshold such that a sequence is called
                a fragment when :math:`L \\le fragthresh \times alen`.
            wid (`double`): The percent identity threshold for BLOSUM relative
                weighting.
            esigma (`float`): The minimum total relative entropy parameter
                for effective number entropy weights.
            eid (`float`): The percent identity threshold for effective
                number clustering.
            EmL (`int`): The length of sequences generated for MSV fitting.
            EmN (`int`): The number of sequences generated for MSV fitting.
            EvL (`int`): The lenght of sequences generated for Viterbi fitting.
            EvN (`int`): The number of sequences generated for Viterbi fitting.
            EfL (`int`): The lenght of sequences generated for Forward fitting.
            EfN (`int`): The number of sequences generated for Forward fitting.
            Eft (`float`): The tail mass used for Forward fitting.
            seed (`int`): The seed to use to initialize the internal random
                number generator. If ``0`` is given, an arbitrary seed will
                be chosen based on the current time.
            ere (`double`, optional): The relative entropy target for effective
                number weighting, or `None`.
583
584
585
            popen (`float`, optional): The *gap open* probability to use
                when building HMMs from single sequences. The default value
                depends on the alphabet: *0.02* for proteins,
586
                *0.03125* for nucleotides.
587
588
589
590
591
592
593
594
595
            pextend (`float`, optional): The *gap extend* probability to use
                when building HMMs from single sequences. Default depends on
                the alphabet: *0.4* for proteins, *0.75* for nucleotides.
            score_matrix (`str`, optional): The name of the score matrix to
                use when building HMMs from single sequences. The only
                allowed value for nucleotide alphabets is *DNA1*. For
                proteins, *PAM30*, *PAM70*, *PAM120*, *PAM240*, *BLOSUM45*,
                *BLOSUM50*, *BLOSUM62* (the default), *BLOSUM80* or
                *BLOSUM90* can be used.
596
597
            window_length (`float`, optional): The window length for
                nucleotide sequences, essentially the max expected hit length.
598
                *If given, takes precedence over* ``window_beta``.
599
600
            window_beta (`float`, optional): The tail mass at which window
                length is determined for nucleotide sequences.
601

602
        """
603
        # extract alphabet and create raw P7_BUILDER object
604
        self.alphabet = alphabet
605
        abcty = alphabet._abc.type
606
607
        with nogil:
            self._bld = libhmmer.p7_builder.p7_builder_Create(NULL, alphabet._abc)
608
        if self._bld == NULL:
609
            raise AllocationError("P7_BG", sizeof(P7_BG))
610

611
612
613
614
615
616
617
618
        # create a Randomness object exposing the internal RNG
        self.randomness = Randomness.__new__(Randomness)
        self.randomness._owner = self
        self.randomness._rng = self._bld.r

        # store the seed given at initialization and reseed the RNG
        self.seed = seed

619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
        # set numeric parameters
        self._bld.symfrac = symfrac
        self._bld.fragthresh = fragthresh
        self._bld.wid = wid
        self._bld.esigma = esigma
        self._bld.eid = eid
        self._bld.EmL = EmL
        self._bld.EmN = EmN
        self._bld.EvL = EvL
        self._bld.EvN = EvN
        self._bld.EfL = EfL
        self._bld.EfN = EfN
        self._bld.Eft = Eft

        # set the architecture strategy
634
        self.architecture = architecture
635
        _arch = BUILDER_ARCHITECTURE_STRATEGY.get(architecture)
636
637
638
639
640
641
        if _arch is not None:
            self._bld.arch_strategy = _arch
        else:
            raise ValueError(f"Invalid value for 'architecture': {architecture}")

        # set the weighting strategy
642
        self.weighting = weighting
643
        _weighting = BUILDER_WEIGHTING_STRATEGY.get(weighting)
644
645
646
647
648
649
        if _weighting is not None:
            self._bld.wgt_strategy = _weighting
        else:
            raise ValueError(f"Invalid value for 'weighting': {weighting}")

        # set the effective sequence number strategy
650
        self.effective_number = effective_number
651
652
653
        if isinstance(effective_number, (int, float)):
            self._bld.effn_strategy = p7_effnchoice_e.p7_EFFN_SET
            self._bld.eset = effective_number
654
        elif isinstance(effective_number, str):
655
            _effn = BUILDER_EFFECTIVE_STRATEGY.get(effective_number)
656
657
658
659
            if _effn is not None:
                self._bld.effn_strategy = _effn
            else:
                raise ValueError(f"Invalid value for 'effective_number': {effective_number}")
660
661
662
        else:
            ty = type(effective_number).__name__
            raise TypeError(f"Invalid type for 'effective_number': {ty}")
663

664
665
666
667
668
669
670
671
672
673
        # set the RE target if given one, otherwise the default
        # is alphabet dependent
        if ere is not None:
            self._bld.re_target = ere
        elif alphabet.is_nucleotide():
            self._bld.re_target = libhmmer.p7_ETARGET_DNA
        elif alphabet.is_amino():
            self._bld.re_target = libhmmer.p7_ETARGET_AMINO
        else:
            self._bld.re_target = libhmmer.p7_ETARGET_OTHER
674
675

        # set the prior scheme
676
        self.prior_scheme = prior_scheme
677
678
679
680
681
682
683
684
685
        if prior_scheme is None:
            self._bld.prior = NULL
        elif prior_scheme == "laplace":
            self._bld.prior = libhmmer.p7_prior.p7_prior_CreateLaplace(self.alphabet._abc)
        elif prior_scheme == "alphabet":
            if alphabet.is_amino():
                self._bld.prior = libhmmer.p7_prior.p7_prior_CreateAmino()
            elif alphabet.is_nucleotide():
                self._bld.prior = libhmmer.p7_prior.p7_prior_CreateNucleic()
686
            else:
687
688
689
                self._bld.prior = libhmmer.p7_prior.p7_prior_CreateLaplace(self.alphabet._abc)
        else:
            raise ValueError("Invalid value for 'prior_scheme': {prior_scheme}")
690

691
        # set the gap probabilities and score matrix using alphabet-specific
692
        # defaults (this is only used when building a HMM for a single sequence)
693
        if alphabet.is_nucleotide():
694
            self.score_matrix = "DNA1" if score_matrix is None else score_matrix
695
696
697
            self.popen = 0.03125 if popen is None else popen
            self.pextend = 0.75 if pextend is None else pextend
        else:
698
            self.score_matrix = "BLOSUM62" if score_matrix is None else score_matrix
699
700
701
            self.popen = 0.02 if popen is None else popen
            self.pextend = 0.4 if pextend is None else pextend

702
703
704
705
706
707
        # set the window options
        self.window_length = window_length
        if window_beta is None:
            self.window_beta = libhmmer.p7_builder.p7_DEFAULT_WINDOW_BETA
        else:
            self.window_beta = window_beta
708

709
710
711
    def __dealloc__(self):
        libhmmer.p7_builder.p7_builder_Destroy(self._bld)

712
713
714
    def __copy__(self):
        return self.copy()

715
716
717
718
    # --- Properties ---------------------------------------------------------

    @property
    def seed(self):
719
720
721
722
723
        """`int`: The seed given at builder initialization.

        Setting this attribute to a different value will cause the internal
        random number generator to be reseeded immediately.

724
        .. versionchanged:: 0.4.2
725
           Avoid shadowing initial null seeds given on builder initialization.
726

727
        """
728
        return self._seed
729
730
731

    @seed.setter
    def seed(self, uint32_t seed):
732
        self._seed = seed
733
        self._bld.do_reseeding = seed != 0
734
        self.randomness._seed(seed)
735

736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
    @property
    def window_length(self):
        """`int` or `None`: The window length for nucleotide sequences.
        """
        assert self._bld != NULL
        return None if self._bld.w_len == -1 else self._bld.w_len

    @window_length.setter
    def window_length(self, object window_length):
        assert self._bld != NULL
        if window_length is None:
            self._bld.w_len = -1
        elif window_length > 3:
            self._bld.w_len = window_length
        else:
            raise ValueError(f"Invalid window length: {window_length!r}")

    @property
    def window_beta(self):
        """`float`: The tail mass at which window length is determined.
        """
        assert self._bld != NULL
        return self._bld.w_beta

    @window_beta.setter
    def window_beta(self, double window_beta):
        assert self._bld != NULL
        if window_beta > 1 or window_beta < 0:
            raise ValueError(f"Invalid window tail mass: {window_beta!r}")
        self._bld.w_beta = window_beta

767
768
    # --- Methods ------------------------------------------------------------

769
770
771
772
773
    cpdef tuple build(
        self,
        DigitalSequence sequence,
        Background background,
    ):
774
        """build(self, sequence, background)\n--
775
776

        Build a new HMM from ``sequence`` using the builder configuration.
777
778
779
780

        Arguments:
            sequence (`~pyhmmer.easel.DigitalSequence`): A single biological
                sequence in digital mode to build a HMM with.
781
782
            background (`pyhmmer.plan7.background`): The background model
                to use to create the HMM.
783

784
785
786
787
        Returns:
            (`HMM`, `Profile`, `OptimizedProfile`): A tuple containing the
            new HMM as well as profiles to be used directly in a `Pipeline`.

788
        Raises:
789
790
            `~pyhmmer.errors.AlphabetMismatch`: When either ``sequence`` or
                ``background`` have the wrong alphabet for this builder.
791
792
            `ValueError`: When the HMM cannot be created successfully
                from the input; the error message contains more details.
793

794
795
796
797
798
799
800
801
802
803
804
        Hint:
            The score matrix and the gap probabilities used here can be set
            when initializing the builder, or changed by setting a new value
            to the right attribute::

                >>> alphabet = easel.Alphabet.amino()
                >>> background = plan7.Background(alphabet)
                >>> builder = plan7.Builder(alphabet, popen=0.05)
                >>> builder.score_matrix = "BLOSUM90"
                >>> hmm, _, _ = builder.build(proteins[0], background)

805
806
807
        .. versionchanged:: 0.4.6
           Sets the `HMM.creation_time` attribute with the current time.

808
809
810
        .. versionchanged:: 0.4.10
           The score system is now cached between calls to `Builder.build`.

811
        """
812
813
        assert self._bld != NULL

814
815
816
        cdef int              status
        cdef HMM              hmm     = HMM.__new__(HMM)
        cdef Profile          profile = Profile.__new__(Profile)
817
        cdef OptimizedProfile opti    = OptimizedProfile.__new__(OptimizedProfile)
818
        cdef bytes            mx      = self.score_matrix.encode('utf-8')
819
        cdef char*            mx_ptr  = <char*> mx
820
        cdef str              msg
821

822
823
        # reseed RNG used by the builder if needed
        if self._bld.do_reseeding:
824
            self.randomness._seed(self.seed)
825

826
        # check alphabet and assign it to newly created objects
827
        hmm.alphabet = profile.alphabet = opti.alphabet = self.alphabet
828
        if not self.alphabet._eq(background.alphabet):
829
            raise AlphabetMismatch(self.alphabet, background.alphabet)
830
        if not self.alphabet._eq(sequence.alphabet):
831
            raise AlphabetMismatch(self.alphabet, sequence.alphabet)
832

833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
        # only load the score system if it hasn't been loaded already,
        # or if a different score system is in use
        if self._bld.S == NULL or strcmp(self._bld.S.name, mx_ptr) != 0:
            with nogil:
                status = libhmmer.p7_builder.p7_builder_LoadScoreSystem(
                    self._bld,
                    mx_ptr,
                    self.popen,
                    self.pextend,
                    background._bg
                )
            if status == libeasel.eslEINVAL:
                msg = self._bld.errbuf.decode('utf-8', 'ignore')
                raise RuntimeError(f"failed to set single sequence score system: {msg}")
            elif status != libeasel.eslOK:
                raise UnexpectedError(status, "p7_builder_LoadScoreSystem")
        else:
            self._bld.popen = self.popen
            self._bld.pextend = self.pextend
852
853
854

        # build HMM and profiles
        with nogil:
855
856
857
858
859
860
861
862
863
            status = libhmmer.p7_builder.p7_SingleBuilder(
                self._bld,
                sequence._sq,
                background._bg,
                &hmm._hmm, # HMM
                NULL, # traceback
                &profile._gm, # profile,
                &opti._om, # optimized profile
            )
864
        if status == libeasel.eslOK:
865
            hmm.command_line = " ".join(sys.argv)
866
        elif status == libeasel.eslEINVAL:
867
868
            msg = self._bld.errbuf.decode("utf-8", "replace")
            raise ValueError("Could not build HMM: {}".format(msg))
869
870
871
        else:
            raise UnexpectedError(status, "p7_SingleBuilder")

872
873
874
        # return newly built HMM, profile and optimized profile
        return hmm, profile, opti

875
876
877
878
879
    cpdef tuple build_msa(
        self,
        DigitalMSA msa,
        Background background,
    ):
880
881
882
883
884
885
886
887
888
889
        """build_msa(self, msa, background)\n--

        Build a new HMM from ``msa`` using the builder configuration.

        Arguments:
            msa (`~pyhmmer.easel.DigitalMSA`): A multiple sequence
                alignment in digital mode to build a HMM with.
            background (`pyhmmer.plan7.background`): The background model
                to use to create the HMM.

890
891
892
893
        Returns:
            (`HMM`, `Profile`, `OptimizedProfile`): A tuple containing the
            new HMM as well as profiles to be used directly in a `Pipeline`.

894
        Raises:
895
896
            `~pyhmmer.errors.AlphabetMismatch`: When either ``msa`` or
                ``background`` have the wrong alphabet for this builder.
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
            `ValueError`: When the HMM cannot be created successfully
                from the input; the error message contains more details.

        Caution:
            HMMER requires that every HMM has a name, so the `Builder` will
            attempt to use the name of the sequence to name the HMM. Passing
            an MSA without a name will result in an error::

                >>> alphabet = easel.Alphabet.amino()
                >>> msa = easel.TextMSA(sequences=[
                ...   easel.TextSequence(name=b"ustiA", sequence="YAIG"),
                ...   easel.TextSequence(name=b"ustiB", sequence="YVIG")
                ... ])
                >>> builder = plan7.Builder(alphabet)
                >>> background = plan7.Background(alphabet)
                >>> builder.build_msa(msa.digitize(alphabet), background)
                Traceback (most recent call last):
                  ...
                ValueError: Could not build HMM: Unable to name the HMM.
916

917
918
        .. versionadded:: 0.3.0

919
920
921
        .. versionchanged:: 0.4.6
           Sets the `HMM.creation_time` attribute with the current time.

922
923
924
        """
        assert self._bld != NULL

925
926
927
928
        cdef int              status
        cdef HMM              hmm     = HMM.__new__(HMM)
        cdef Profile          profile = Profile.__new__(Profile)
        cdef OptimizedProfile opti    = OptimizedProfile.__new__(OptimizedProfile)
929
        cdef str              msg
930
        cdef size_t           k
931

932
933
        # unset builder probabilities and score system if they were set
        # by a call to `build`, since we won't be needing them here
934
        self._bld.popen = self._bld.pextend = -1
935
936
937
938
939
940
        if self._bld.S != NULL:
            libeasel.scorematrix.esl_scorematrix_Destroy(self._bld.S)
            self._bld.S = NULL
        if self._bld.Q != NULL:
            libeasel.dmatrix.esl_dmatrix_Destroy(self._bld.Q)
            self._bld.Q = NULL
941
942
943

        # reseed RNG used by the builder if needed
        if self._bld.do_reseeding:
944
            self.randomness._seed(self.seed)
945
946
947

        # check alphabet and assign it to newly created objects
        hmm.alphabet = profile.alphabet = opti.alphabet = self.alphabet
948
        if not self.alphabet._eq(background.alphabet):
949
            raise AlphabetMismatch(self.alphabet, background.alphabet)
950
        if not self.alphabet._eq(msa.alphabet):
951
            raise AlphabetMismatch(self.alphabet, msa.alphabet)
952

953
        # build HMM
954
955
956
957
958
959
960
961
962
963
964
965
966
        with nogil:
            # build HMM and profiles
            status = libhmmer.p7_builder.p7_Builder(
                self._bld,
                msa._msa,
                background._bg,
                &hmm._hmm, # HMM
                NULL, # traceback
                &profile._gm, # profile,
                &opti._om, # optimized profile
                NULL # modified msa
            )
        if status == libeasel.eslOK:
967
            hmm.command_line = " ".join(sys.argv)
968
        elif status == libeasel.eslEINVAL:
969
970
            msg = self._bld.errbuf.decode("utf-8", "replace")
            raise ValueError("Could not build HMM: {}".format(msg))
971
972
        else:
            raise UnexpectedError(status, "p7_Builder")
973

974
975
976
        # return newly built HMM, profile and optimized profile
        return hmm, profile, opti

977
    cpdef Builder copy(self):
978
979
980
981
        """copy(self)\n--

        Create a duplicate `Builder` instance with the same arguments.

982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
        """
        assert self._bld != NULL
        return Builder(
            self.alphabet,
            architecture=self.architecture,
            weighting=self.weighting,
            effective_number=self.effective_number,
            prior_scheme=self.prior_scheme,
            symfrac=self._bld.symfrac,
            fragthresh=self._bld.fragthresh,
            wid=self._bld.wid,
            esigma=self._bld.esigma,
            eid=self._bld.eid,
            EmL=self._bld.EmL,
            EmN=self._bld.EmN,
            EvL=self._bld.EvL,
            EvN=self._bld.EvN,
999
1000
            EfL=self._bld.EfL,
            EfN=self._bld.EfN,
For faster browsing, not all history is shown. View entire blame