neon.c 3.54 KB
Newer Older
1
#include "sequence.h"
2
#include "neon.h"
3
#include "generic.h"
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32

#ifdef __ARM_NEON__

#include <arm_neon.h>

/*
 * Compute, for every node index j in [min, i), whether the connection
 * from node j (n1) to node i (n2) can be skipped, and record the result
 * in skip[j].
 *
 * strands: per-node strand, 1 for forward and -1 for reverse.
 * types:   per-node type; only equality with STOP is tested here.
 * frames:  per-node frame; only equality between n1 and n2 is tested.
 * min:     first node index to examine (inclusive).
 * i:       index of the target node n2; also the exclusive upper bound.
 * skip:    output array, written at indices [min, i).
 *
 * Indices handled by the vector loop are overwritten with 0xFF when any
 * rule matches and 0x00 otherwise. Indices before the first multiple of
 * 16 and in the trailing partial chunk are delegated to
 * skippable_generic_single().
 * NOTE(review): the value encoding written by skippable_generic_single()
 * is not visible from this file -- confirm it is compatible with the
 * 0xFF / 0x00 masks stored here.
 */
void skippable_neon(
    const int8_t* strands,
    const uint8_t* types,
    const uint8_t* frames,

    const int min,
    const int i,
    uint8_t* skip
) {
  const uint8x16_t all_stops = vdupq_n_u8(STOP);
  const uint8x16_t all_fwd   = vdupq_n_u8(1);
  /* strand -1 is compared through its unsigned byte pattern (0xFF) */
  const uint8x16_t all_bwd   = vdupq_n_u8((uint8_t) -1);

  /* n2 is the same node for every lane: broadcast it once */
  const uint8x16_t n2_strands = vdupq_n_u8((uint8_t) strands[i]);
  const uint8x16_t n2_types   = vdupq_n_u8(types[i]);
  const uint8x16_t n2_frames  = vdupq_n_u8(frames[i]);

  /* predicates that only depend on n2 are loop-invariant */
  const uint8x16_t n2_is_stop  = vceqq_u8(n2_types, all_stops);
  const uint8x16_t n2_not_stop = vmvnq_u8(n2_is_stop);
  const uint8x16_t n2_is_fwd   = vceqq_u8(n2_strands, all_fwd);
  const uint8x16_t n2_is_bwd   = vceqq_u8(n2_strands, all_bwd);

  /* first index that is a multiple of 16 at or after min */
  const int head_end = (min + 0xF) & (~0xF);
  int j;

  /* scalar head: stop at i as well, so that a short range never touches
     indices outside [min, i) */
  for (j = min; j < head_end && j < i; j++) {
    skippable_generic_single(strands, types, frames, j, i, skip);
  }

  /* vector body: 16 candidate n1 nodes per iteration */
  for (; j + 15 < i; j += 16) {
    const uint8x16_t n1_strands = vld1q_u8((const uint8_t*) &strands[j]);
    const uint8x16_t n1_types   = vld1q_u8(&types[j]);
    const uint8x16_t n1_frames  = vld1q_u8(&frames[j]);

    /* per-lane predicates on n1, each computed once and reused below */
    const uint8x16_t n1_is_stop  = vceqq_u8(n1_types, all_stops);
    const uint8x16_t n1_not_stop = vmvnq_u8(n1_is_stop);
    const uint8x16_t n1_is_fwd   = vceqq_u8(n1_strands, all_fwd);
    const uint8x16_t n1_is_bwd   = vceqq_u8(n1_strands, all_bwd);
    const uint8x16_t same_strand = vceqq_u8(n1_strands, n2_strands);
    const uint8x16_t diff_frame  = vmvnq_u8(vceqq_u8(n1_frames, n2_frames));

    uint8x16_t x;
    uint8x16_t s;

    // 5'fwd->5'fwd, 5'rev->5'rev
    // n1->strand == n2->strand && n2->type != STOP && n1->type != STOP
    s = vandq_u8(n1_not_stop, vandq_u8(n2_not_stop, same_strand));

    // 5'fwd->5'rev, 5'fwd->3'rev
    // n2->strand == -1 && n1->strand == 1 && n1->type != STOP
    x = vandq_u8(n1_not_stop, vandq_u8(n1_is_fwd, n2_is_bwd));
    s = vorrq_u8(s, x);

    // 3'rev->5'fwd, 3'rev->3'fwd
    // n1->type == STOP && n1->strand == -1 && n2->strand == 1
    x = vandq_u8(n2_is_fwd, vandq_u8(n1_is_bwd, n1_is_stop));
    s = vorrq_u8(s, x);

    // 5'rev->3'fwd
    // n2->type == STOP && n1->strand == -1 && n2->strand == 1 && n1->type != STOP
    x = vandq_u8(n1_is_bwd, n2_is_stop);
    x = vandq_u8(n2_is_fwd, x);
    x = vandq_u8(n1_not_stop, x);
    s = vorrq_u8(s, x);

    // 5'fwd->3'fwd
    // n1->strand == n2->strand && n1->strand == 1 && n1->type != STOP && n2->type == STOP && frames differ
    x = vandq_u8(n1_is_fwd, same_strand);
    x = vandq_u8(n1_not_stop, x);
    x = vandq_u8(n2_is_stop, x);
    x = vandq_u8(diff_frame, x);
    s = vorrq_u8(s, x);

    // 3'rev->5'rev
    // n1->strand == n2->strand && n1->strand == -1 && n1->type == STOP && n2->type != STOP && frames differ
    x = vandq_u8(n1_is_bwd, same_strand);
    x = vandq_u8(n1_is_stop, x);
    x = vandq_u8(n2_not_stop, x);
    x = vandq_u8(diff_frame, x);
    s = vorrq_u8(s, x);

    // store result mask
    vst1q_u8(&skip[j], s);
  }

  /* scalar tail: fewer than 16 candidates left before i */
  for (; j < i; j++) {
    skippable_generic_single(strands, types, frames, j, i, skip);
  }
}
#endif