Verified Commit 07ecbb35 authored by Chris Kerr's avatar Chris Kerr

Merge branch 'master' into 47-use-pyopencl-cltypes

Changed dtypes in the new code that was being merged in
parents e21a0101 8122627a
Pipeline #9384 failed with stages
in 24 seconds
......@@ -29,20 +29,17 @@
__attribute__((reqd_work_group_size(WORKSIZE,1,1)))
#endif
__kernel void jsph(const unsigned int LMAX,
__constant float *r,
__global const float *restrict s,
__global float *restrict jl)
const float r,
__constant float *restrict s,
__global float *restrict result)
{
const size_t is = get_global_id(0);
const size_t NS = get_global_size(0);
const unsigned int iAt = get_global_id(1);
const size_t NLS = (LMAX+1) * NS;
unsigned int L = 0;
size_t iAtls = iAt*NLS + is;
size_t ils = is;
const float rs = r[iAt] * s[is];
const float rs = r * s[is];
// For r*s <=2, use the first NPOWER terms of the power series
if (rs <= 2)
......@@ -72,7 +69,7 @@ __kernel void jsph(const unsigned int LMAX,
for (i=NPOWER-1; i>=0; --i) {
total += powers[i];
}
jl[iAtls] = total;
result[ils] = total;
// For higher L the terms of the power series are derived
// using the formula:
......@@ -94,13 +91,13 @@ __kernel void jsph(const unsigned int LMAX,
// to the ith non-zero term of J_L(x) is: x/(2*L+2*i+1)
for (L=1; L<=LMAX; ++L) {
iAtls = iAt*NLS + L*NS + is;
ils = L * NS + is;
total = 0;
for (i=NPOWER-1; i>=0; --i) {
powers[i] *= rs / (2*i + 2*L + 1);
total += powers[i];
}
jl[iAtls] = total;
result[ils] = total;
}
}
// For r*s > 2, the power series requires too many terms to converge,
......@@ -124,18 +121,18 @@ __kernel void jsph(const unsigned int LMAX,
float t2;
// sum_up is used in the normalisation of the downwards iteration
float sum_up = t1*t1;
jl[iAtls] = t1;
result[ils] = t1;
// Calculate Jl for L <= r*s using the upwards recurrence relation
// The first terms of the upwards recurrence are known exactly, so
// there is no need for a pre-iteration or any normalisation
for (L=1; L<=LMAX_up; ++L) {
iAtls = iAt*NLS + L*NS + is;
ils = L * NS + is;
t2 = (2*L-1) * t1 * rX - t0;
t0 = t1;
t1 = t2;
sum_up += (2*L+1) * t2*t2;
jl[iAtls] = t2;
result[ils] = t2;
}
// If LMAX > r*s, the remaining L values must be calculated using the
......@@ -160,8 +157,8 @@ __kernel void jsph(const unsigned int LMAX,
// The number is stored before frexp shifting so that the exponent
// can be retrieved later during normalisation
if (L <= LMAX) {
iAtls = iAt*NLS + L*NS + is;
jl[iAtls] = t2;
ils = L * NS + is;
result[ils] = t2;
}
// Shift large numbers to the range 0.5-1.0 to avoid overflow
exponent = ilogb(t2);
......@@ -178,14 +175,14 @@ __kernel void jsph(const unsigned int LMAX,
// This gives us the normalisation factor for the downward recurrence
float normalise = sqrt((1-sum_up)/sum_down);
for (L=LMAX_up+1; L<=LMAX; ++L) {
iAtls = iAt*NLS + L*NS + is;
t2 = jl[iAtls];
ils = L * NS + is;
t2 = result[ils];
// Load the exponent and adjust the normalisation constant
exponent = ilogb(t2);
if (exponent > 0) {
normalise = ldexp(normalise, -exponent);
}
jl[iAtls] = t2 * normalise;
result[ils] = t2 * normalise;
}
}
}
......@@ -193,6 +190,29 @@ __kernel void jsph(const unsigned int LMAX,
}
/* Many-atom version of the spherical Bessel J_L(r*s) kernel
*
* For arrays r and s, calculate the Sperical Bessel function of the first
* kind of r*s, for L values from 0 to LMAX.
*/
#ifdef WORKSIZE
__attribute__((reqd_work_group_size(WORKSIZE,1,1)))
#endif
__kernel void jsph_batch(
const unsigned int LMAX,
__constant float *restrict r,
__constant float *restrict s,
__global float *restrict result)
{
const size_t NS = get_global_size(0);
const unsigned int iAt = get_global_id(1);
const size_t NLS = (LMAX+1) * NS;
jsph(LMAX, r[iAt], s, result + (iAt * NLS));
}
/* Integral of the spherical Bessel function
*
* Given as input an array of Bessel function values J_L(r*s)
......@@ -211,16 +231,16 @@ __kernel void jsph(const unsigned int LMAX,
* must fit into local memory (i.e. max 8192 samples).
*
* The work group shape should be (WORKSIZE, 1, 1) and
* the number of groups should be (1, LMAX+1, nAt)
* the number of groups should be (1, LMAX+1, 1)
* NS should divide evenly by WORKSIZE (i.e. NS % WORKSIZE == 0)
*/
#ifdef WORKSIZE
__attribute__((reqd_work_group_size(WORKSIZE,1,1)))
#endif
__kernel void jsph_integral(const unsigned int NS,
__constant float *r,
__global const float *restrict s,
__global float *restrict jl,
const float r,
__constant float *restrict s,
__global float *restrict jl,
__local float *jlsum)
{
const unsigned int ithread = get_local_id(0);
......@@ -231,20 +251,16 @@ __kernel void jsph_integral(const unsigned int NS,
#endif
const unsigned int nperthread = NS / nthreads;
const unsigned int L = get_global_id(1);
const unsigned int LMAX1 = get_global_size(1);
const unsigned int iAt = get_global_id(2);
const size_t NLS = LMAX1 * NS;
// Read the jl and s arrays and calculate the trapezium
// area of the individual steps
for (uint ibase=0; ibase < NS; ibase += nthreads) {
size_t is = ibase + ithread;
size_t iAtls = iAt*NLS + L*NS + is;
size_t ils = L*NS + is;
if (is > 0) {
float j1 = jl[iAtls-1];
float j2 = jl[iAtls];
float j1 = jl[ils-1];
float j2 = jl[ils];
float s1 = s[is-1];
float s2 = s[is];
jlsum[is] = (j2 + j1) * (pown(s2, 3) - pown(s1, 3)) / 6;
......@@ -298,11 +314,11 @@ __kernel void jsph_integral(const unsigned int NS,
// Scale by R/S to get the integral over R not S
for (uint ibase=0; ibase < NS; ibase += nthreads) {
size_t is = ibase + ithread;
size_t iAtls = iAt*NLS + L*NS + is;
size_t ils = L*NS + is;
if (is > 0) {
jl[iAtls] = jlsum[is] * pown(r[iAt] / s[is], 3);
jl[ils] = jlsum[is] * pown(r / s[is], 3);
} else {
jl[iAtls] = (L == 0) ? pown(r[iAt], 3) / 3 : 0;
jl[ils] = (L == 0) ? pown(r, 3) / 3 : 0;
}
}
}
\ No newline at end of file
}
......@@ -107,10 +107,10 @@ cfloat_t cfloat_pown(cfloat_t Z, uint N)
#ifdef LMAX1
__attribute__((reqd_work_group_size(LMAX1,1,1)))
#endif
__kernel void ylm(__constant float *costheta,
__constant float *sintheta,
__constant cfloat_t *eiphi,
__global cfloat_t *restrict ylm,
__kernel void ylm(const float costheta,
const float sintheta,
const cfloat_t eiphi,
__global cfloat_t *restrict result,
__local float *MpartF,
__local int *MpartE)
{
......@@ -119,9 +119,6 @@ __kernel void ylm(__constant float *costheta,
const unsigned int LMAX1 = get_global_size(0); // LMAX + 1
#endif
const unsigned int M = get_global_id(0);
const unsigned int iAt = get_global_id(1);
const size_t NLM = (LMAX1 * (LMAX1+1)) / 2;
unsigned int L;
float term2 = 0;
......@@ -131,11 +128,11 @@ __kernel void ylm(__constant float *costheta,
int termE = 0;
int sumexp = 0;
const cfloat_t eiMphi = cfloat_pown(eiphi[iAt], M);
const cfloat_t eiMphi = cfloat_pown(eiphi, M);
for (L=0; L<LMAX1; ++L) {
ylm_Mpart_local(L, sintheta[iAt], MpartF, MpartE);
ylm_Mpart_local(L, sintheta, MpartF, MpartE);
// This calculates the cos(theta) part of the Legendre polynomial in term0
// (the sin(theta) part is calculated in ylm_Mpart_local)
......@@ -146,7 +143,7 @@ __kernel void ylm(__constant float *costheta,
// (L-M+1)*P[L+1,M](x) = (2*L+1)*x*P[L,M](x) - (L+M)*P[L-1,M](x)
// i.e. (L-M)*P[L,M](x) = (2*L-1)*x*P[L-1,M](x) - (L+M-1)*P[L-2,M](x)
// For L == M + 1, P[L-2,M](x) is taken to be zero
term0 = ( costheta[iAt] * (2*L - 1) * term1
term0 = ( costheta * (2*L - 1) * term1
- (L+M-1) * term2) / (L-M);
}
......@@ -157,9 +154,8 @@ __kernel void ylm(__constant float *costheta,
// Now calculate Ylm = (cospart) * (sinpart) * exp(i*M*phi)
if (L >= M) {
size_t ilm = (L*(L+1))/2 + M;
size_t iAtlm = iAt * NLM + ilm;
float magnitude = ldexp(term0 * MpartF[M], sumexp + MpartE[M]);
ylm[iAtlm] = cfloat_mulr(eiMphi, magnitude);
result[ilm] = cfloat_mulr(eiMphi, magnitude);
}
// Rotate term0 to term1 and term1 to term2
......@@ -178,3 +174,33 @@ __kernel void ylm(__constant float *costheta,
}
}
}
/* Many-atom version of the spherical harmonic Y_lm(theta, phi) kernel
*
* The first work group dimension is M and the second is the atom index.
* The calculation for each atom goes in a separate work group of size LMAX1.
* The result is written into a packed array of dimension (nAtoms, NLM).
*/
#ifdef LMAX1
__attribute__((reqd_work_group_size(LMAX1,1,1)))
#endif
__kernel void ylm_batch(
__constant float *restrict costheta,
__constant float *restrict sintheta,
__constant cfloat_t *restrict eiphi,
__global cfloat_t *restrict result,
__local float *restrict MpartF,
__local int *restrict MpartE)
{
#ifndef LMAX1
const unsigned int LMAX1 = get_global_size(0); // LMAX + 1
#endif
#ifndef NLM
const size_t NLM = LMAX1 * (LMAX1 + 1) / 2;
#endif
const unsigned int iAt = get_global_id(1);
ylm(costheta[iAt], sintheta[iAt], eiphi[iAt],
result + (iAt * NLM), MpartF, MpartE);
}
\ No newline at end of file
......@@ -168,8 +168,8 @@ class AtomicScattering(PackedLMMixin, ClProgram):
r, costheta, sintheta, eiphi = sicopolar_array(xyz[iAt0:iAtN,:])
evY, ylm_dev = self.sphharm(costheta, sintheta, eiphi, queue=queue)
evJ, jl_dev = self.sphbes(r, queue=queue)
evY, ylm_dev = self.sphharm.batch(costheta, sintheta, eiphi, queue=queue)
evJ, jl_dev = self.sphbes.batch(r, queue=queue)
ff32 = np.array(ff[iAt0:iAtN,:], dtype=cl_float)
ff_dev = cl_array.to_device(queue, ff32)
......
......@@ -49,13 +49,29 @@ class SphericalBessel(LMAXMixin, ClProgram):
with cl.CommandQueue(context) as queue:
self.s_dev = cl_array.to_device(queue, self.s_host)
@optional_out_array(shape=('natwork', 'LMAX1', 'NS'), dtype=cl_float)
@optional_out_array(shape=('LMAX1', 'NS'), dtype=cl_float)
def __call__(self, r, queue=None, out=None):
"""Calculate the spherical Bessel function j_L(r*s) for L<=LMAX
The calculation is done asynchronously, the returned OpenCL
event object can be used to wait for the result."""
r = np.array(r, dtype=cl_float)
assert r >= 0
NS = self.NS
worksize = self.worksize
ev = self.program.jsph(queue,
(NS,),
(worksize,),
cl_uint(self.LMAX),
cl_float(r),
self.s_dev.data,
out.data)
return ev, out
@optional_out_array(shape=('natwork', 'LMAX1', 'NS'), dtype=cl_float)
def batch(self, r, queue=None, out=None):
"""Batch spherical Bessel function for an array of r values."""
assert np.all(r >= 0)
NS = self.NS
worksize = self.worksize
......@@ -64,7 +80,7 @@ class SphericalBessel(LMAXMixin, ClProgram):
assert nAt <= self.natwork
r_dev = cl_array.to_device(queue, r)
ev = self.program.jsph(queue,
ev = self.program.jsph_batch(queue,
(NS, nAt),
(worksize, 1),
cl_uint(self.LMAX),
......@@ -74,38 +90,34 @@ class SphericalBessel(LMAXMixin, ClProgram):
return ev, out
@optional_out_array(shape=('natwork', 'LMAX1', 'NS'), dtype=cl_float)
@optional_out_array(shape=('LMAX1', 'NS'), dtype=cl_float)
def integral(self, r, queue=None, out=None):
"""Calculate the spherical Bessel integral \Integral{0}{r} j_L(r'*s) dr' for L<=LMAX
"""Calculate the spherical Bessel integral \\Integral{0}{r} j_L(r'*s) dr' for L<=LMAX
The calculation is done asynchronously, the returned OpenCL
event object can be used to wait for the result."""
r = np.array(r, dtype=np.float32)
assert np.all(r >= 0)
event object can be used to wait for the result.
"""
assert r >= 0
NS = self.NS
LMAX = self.LMAX
LMAX1 = self.LMAX1
s_dev = self.s_dev
worksize = self.worksize
nAt = np.size(r)
assert nAt > 0
assert nAt <= self.natwork
r_dev = cl_array.to_device(queue, r)
ev1 = self.program.jsph(queue,
(NS, nAt),
(worksize, 1),
cl_uint(LMAX),
r_dev.data,
s_dev.data,
out.data)
(NS, 1),
(worksize, 1),
cl_uint(LMAX),
cl_float(r),
s_dev.data,
out.data)
localJL = cl.LocalMemory(cl_float().itemsize * NS)
ev = self.program.jsph_integral(queue,
(worksize, LMAX1 ,nAt),
(worksize, 1, 1),
(worksize, LMAX1),
(worksize, 1),
cl_uint(NS),
r_dev.data,
cl_float(r),
s_dev.data,
out.data,
localJL,
......
......@@ -7,7 +7,7 @@ __license__ = "LGPLv3+"
import numpy as np
import pyopencl as cl
from pyopencl import array as cl_array
import pyopencl.array as cl_array
from pyopencl.cltypes import float as cl_float
from .util import ClProgram, optional_out_array, PackedLMMixin
......@@ -38,15 +38,41 @@ class SphericalHarmonic(PackedLMMixin, ClProgram):
options=["-DLMAX1=%d" % (self.LMAX1,)],
context=context)
@optional_out_array(shape=('natwork', 'NLM'), dtype=cl_complex)
@optional_out_array(shape=('NLM',), dtype=cl_complex)
def __call__(self, costheta, sintheta, eiphi, queue=None, out=None):
"""Calculate all non-negative spherical harmonics up to LMAX
"""Calculate all non-negative spherical harmonics up to LMAX.
The calculation is done asynchronously, the returned OpenCL
event object can be used to wait for the result.
"""
assert -1 <= costheta
assert costheta <= 1
assert 0 <= sintheta # 0 <= theta <= pi
assert sintheta <= 1
LMAX1 = self.LMAX1
localmemF = cl.LocalMemory(4*LMAX1)
localmemI = cl.LocalMemory(4*LMAX1)
ev = self.program.ylm(queue,
(LMAX1,),
(LMAX1,),
cl_float(costheta),
cl_float(sintheta),
cl_complex(eiphi),
out.data,
localmemF,
localmemI)
return ev, out
@optional_out_array(shape=('natwork', 'NLM'), dtype=cl_complex)
def batch(self, costheta, sintheta, eiphi, queue=None, out=None):
"""Calculate spherical harmonics for a batch of atoms.
The calculation is done asynchronously, the returned OpenCL
event object can be used to wait for the result."""
costheta = np.array(costheta, dtype=cl_float)
sintheta = np.array(sintheta, dtype=cl_float)
eiphi = np.array(eiphi, dtype=cl_complex)
event object can be used to wait for the result.
"""
assert np.shape(costheta) == np.shape(sintheta)
assert np.shape(costheta) == np.shape(eiphi)
assert np.all(-1 <= costheta)
......@@ -65,14 +91,14 @@ class SphericalHarmonic(PackedLMMixin, ClProgram):
localmemF = cl.LocalMemory(4*LMAX1)
localmemI = cl.LocalMemory(4*LMAX1)
ev = self.program.ylm(queue,
(LMAX1, nAt),
(LMAX1, 1),
costh_dev.data,
sinth_dev.data,
eiphi_dev.data,
out.data,
localmemF,
localmemI)
ev = self.program.ylm_batch(queue,
(LMAX1, nAt),
(LMAX1, 1),
costh_dev.data,
sinth_dev.data,
eiphi_dev.data,
out.data,
localmemF,
localmemI)
return ev, out
......@@ -87,7 +87,7 @@ def test_single(cl_context, cl_queue,
ev.wait()
jl_cl = jl_dev.get(cl_queue)
tolerance_factor = max((1, smax * r))
npt.assert_allclose(jl_cl[0,:,:], jl_sp,
npt.assert_allclose(jl_cl, jl_sp,
rtol=(tolerance_factor * 1e-5),
atol=(tolerance_factor * 1e-8))
......@@ -114,7 +114,7 @@ def test_multiple(cl_context,
assume(nAt <= natwork)
s = np.linspace(0, smax, num=NS, endpoint=True, dtype='f4')
clBessel = SphericalBessel(LMAX, s, worksize, natwork, context=cl_context)
jl_dev = clBessel(r)
jl_dev = clBessel.batch(r)
jl_cl = jl_dev.get()
for iAt in range(nAt):
jl_sp = scipyBessel(LMAX, s, r[iAt])
......@@ -148,7 +148,7 @@ def test_integral_simple(cl_context, cl_queue,
ev.wait()
jl_cl = jl_dev.get(cl_queue)
tolerance_factor = max((1, smax * r))
npt.assert_allclose(jl_cl[0,:,:], jl_sp,
npt.assert_allclose(jl_cl, jl_sp,
rtol=(tolerance_factor * 1e-4),
atol=(tolerance_factor * 1e-6))
......@@ -184,6 +184,6 @@ def test_integral_quad(cl_context,
for i_s in i_s_test:
jl_sp = np.array([scipyBesselQuad(L, s[i_s], r)
for L in range(LMAX + 1)])
npt.assert_allclose(jl_cl[0,:,i_s], jl_sp,
npt.assert_allclose(jl_cl[:,i_s], jl_sp,
rtol=(tolerance_factor * 1e-3),
atol=(tolerance_factor * 1e-4))
......@@ -43,7 +43,7 @@ def test_single(cl_context, cl_queue, LMAX, theta, phi):
ev.wait()
ylm_cl = ylm_dev.get(cl_queue)
npt.assert_allclose(ylm_cl[0,:], ylm_sp, rtol=1e-4, atol=1e-6)
npt.assert_allclose(ylm_cl[:], ylm_sp, rtol=1e-4, atol=1e-6)
@settings(max_examples=10)
......@@ -72,7 +72,7 @@ def test_multiple(cl_context, cl_queue, LMAX, theta_phi):
# Rounding error sometimes gives theta slightly greater than pi
assume(np.all(sintheta >= 0))
ev, ylm_dev = clSphHarm(costheta, sintheta, eiphi, queue=cl_queue)
ev, ylm_dev = clSphHarm.batch(costheta, sintheta, eiphi, queue=cl_queue)
NLM = (LMAX+1)*(LMAX+2)//2
ylm_sp = np.empty((nAt, NLM), dtype=np.complex128)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment