Commit 1bf87c99 authored by Chris Kerr's avatar Chris Kerr

New alm_add_atom kernel using builtin dot() function

In theory this should be faster, but I have not done any tests
to demonstrate this yet.
parent 78840013
......@@ -97,6 +97,96 @@ __kernel void alm_add_many_atoms(const unsigned int nAt,
}
}
#ifdef WORKSIZE
__attribute__((reqd_work_group_size(1,WORKSIZE,1)))
#endif
__kernel void alm_add_16_atoms(const unsigned int nAt,
__global const cfloat_t *restrict ylm, //Y(L,M,theta,phi) dimensions[LM]
__global const float *restrict jl, //Jy(L,s*r) dimensions[L,s]
__global const float *restrict ff, //Form factor dimensions[*,s]
__constant unsigned int *ffindex, //Index into form factor array
__global cfloat_t *restrict alm, //A(L,M,s) dimensions[LM,s]
__local float* ylm_local)
{
const uint L = get_global_id(0);
const uint LMAX1 = get_global_size(0);
const size_t is = get_global_id(1);
const size_t NS = get_global_size(1);
//assert(get_local_size(0) == 1)
const uint ilocal = get_local_id(1);
const uint Nlocal = get_local_size(1);
const size_t is_group0 = get_group_id(1) * get_local_size(1);
const size_t il0 = L * (L+1) / 2;
const size_t NLM = (LMAX1 * (LMAX1+1)) / 2;
const size_t NLS = LMAX1 * NS;
const size_t ils = L * NS + is;
union {
float16 vec;
float arr[16];
} ffjl;
for (uint iAt=0; iAt<nAt; ++iAt) {
uint iFF = ffindex[iAt];
size_t iFFS = iFF * NS + is;
size_t iAtLS = iAt * NLS + ils;
ffjl.arr[iAt] = ff[iFFS] * jl[iAtLS];
}
for (uint i=ilocal; i<16*(L+1); i+=Nlocal) {
uint M = i % (L+1);
uint iAt = i / (L+1);
size_t iAtMr = 16 * M + iAt;
size_t iAtMi = 16 * (LMAX1+M) + iAt;
if (iAt < nAt) {
size_t iAtLM = iAt * NLM + il0 + M;
cfloat_t ylmtmp = ylm[iAtLM];
ylm_local[iAtMr] = cfloat_real(ylmtmp);
ylm_local[iAtMi] = cfloat_imag(ylmtmp);
} else {
ylm_local[iAtMr] = 0;
ylm_local[iAtMi] = 0;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
for (uint M=0; M<=L; ++M) {
size_t ilm = il0 + M;
size_t ilms = ilm * NS + is;
float16 ylmvec;
cfloat_t almtot = alm[ilms];
ylmvec = vload16(M, ylm_local); // real part
float newalm_real = 0;
newalm_real += dot(ffjl.vec.s0123, ylmvec.s0123);
newalm_real += dot(ffjl.vec.s4567, ylmvec.s5467);
newalm_real += dot(ffjl.vec.s89ab, ylmvec.s89ab);
newalm_real += dot(ffjl.vec.scdef, ylmvec.scdef);
//almtot = cfloat_addr(almtot, newalm_real);
ylmvec = vload16(LMAX1+M, ylm_local); // imaginary part
float newalm_imag = 0;
newalm_imag += dot(ffjl.vec.s0123, ylmvec.s0123);
newalm_imag += dot(ffjl.vec.s4567, ylmvec.s5467);
newalm_imag += dot(ffjl.vec.s89ab, ylmvec.s89ab);
newalm_imag += dot(ffjl.vec.scdef, ylmvec.scdef);
//almtot = cfloat_addi(almtot, newalm_imag);
almtot = cfloat_add(almtot, cfloat_new(newalm_real, newalm_imag));
alm[ilms] = almtot;
}
}
// Squared magnitude of a complex number
float cfloat_abs2(const cfloat_t z)
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment