print '%(language)04d a nawiasy {} ' % {"language": 1234, "number": 2}

print '{zmienna} a nawiasy: {{}}'.format( **{"zmienna": 123} )

import pycuda.gpuarray as gpuarray

import numpy
from pycuda.curandom import rand as curand
from pycuda.compiler import SourceModule
import pycuda.driver as cuda


cuda.init()
device = cuda.Device(0)
ctx = device.make_context()
print device.name(), device.compute_capability(),device.total_memory()/1024.**3,"GB"


blocks = 2**11
block_size = 2**8
N = blocks*block_size

omega = 4.9
spp = 100
dt = 2.0*np.pi/omega/spp
pars = {'samples':spp,'N':N,'dt':dt,'gam':0.9,'d0':0.001,'omega':omega,'force':0.1,'amp':4.2}

rng_src = """
#define PI 3.14159265358979f
/*
 * Return a uniformly distributed random number from the
 * [0;1] range.
 */
__device__ float rng_uni(unsigned int *state)
{
	unsigned int x = *state;

	x = x ^ (x >> 13);
	x = x ^ (x << 17);
	x = x ^ (x >> 5);

	*state = x;

	return x / 4294967296.0f;
}
/*
 * Generate two normal variates given two uniform variates.
 */
__device__ void bm_trans(float& u1, float& u2)
{
	float r = sqrtf(-2.0f * logf(u1));
	float phi = 2.0f * PI * u2;
	u1 = r * cosf(phi);
	u2 = r * sinf(phi);
}

"""

src = """
  __device__ inline void diffEq(float &nx, float &nv, float x, float v, float t)
{{
	nx = v;
	nv = -2.0f * PI * cosf(2.0f * PI * x) + {amp}f * cosf({omega}f * t) + {force}f - {gam}f * v;
}}

__global__ void SDE(float *cx,float *cv,unsigned int *rng_state, float ct)
    {{
      int idx = blockDim.x*blockIdx.x + threadIdx.x;
      float n1, n2; 	
      unsigned int lrng_state;
      float xim, vim, xt1, vt1, xt2, vt2,t,x,v;
      lrng_state = rng_state[idx]; 
      t = ct;
      x = cx[idx];
	  v = cv[idx]; 
    
      for (int i=1;i<={samples};i++)  {{
      	n1 = rng_uni(&lrng_state);
		n2 = rng_uni(&lrng_state);
		bm_trans(n1, n2);
	diffEq(xt1, vt1, x, v, t);
		xim = x + xt1 * {dt}f;
		vim = v + vt1 * {dt}f + sqrtf({dt}f * {gam}f * {d0}f * 2.0f) * n1;
		t = ct + i * {dt}f;
	diffEq(xt2, vt2, xim, vim, t);
		x += 0.5f * {dt}f * (xt1 + xt2);
		v += 0.5f * {dt}f * (vt1 + vt2) + sqrtf(2.0f * {dt}f * {gam}f * {d0}f) * n2;
      }}
       cx[idx] = x;
	   cv[idx] = v;

	   rng_state[idx] = lrng_state;;
       
    }}
    """.format(**pars)

mod = SourceModule(rng_src + src,options=["--use_fast_math"])
SDE = mod.get_function("SDE")

print "kernel ready for ",block_size,"N =",N,N/1e6

print spp,N

import time

x  = np.zeros(N,dtype=np.float32)
v  = np.ones(N,dtype=np.float32)
rng_state = np.array(np.random.randint(1,2147483648,size=N),dtype=np.uint32)

x_g = gpuarray.to_gpu(x)
v_g = gpuarray.to_gpu(v)
rng_state_g = gpuarray.to_gpu(rng_state)

start = time.time()
for i in range(0,200000,spp):
    t = i * 2.0 * np.pi /omega /spp;
    SDE(x_g, v_g, rng_state_g, np.float32(t), block=(block_size,1,1), grid=(blocks,1))

ctx.synchronize()
elapsed = (time.time() - start)
x=x_g.get()
print elapsed,N/1e6, 200000*N/elapsed/1024.**3,"Giter/sek"

h=histogram(x,bins=50,range=(-150, 100) ) 
figsize(8,4)
plot(h[1][1:],h[0])


hist_ref = (array([   46,    72,   134,   224,   341,   588,   917,  1504,  2235,\
        3319,  4692,  6620,  8788, 11700, 15139, 18702, 22881, 26195,\
       29852, 32700, 35289, 36232, 36541, 35561, 33386, 30638, 27267,\
       23533, 19229, 16002, 12646,  9501,  7111,  5079,  3405,  2313,\
        1515,   958,   573,   370,   213,   103,    81,    28,    15,\
           7,     3,     2,     0,     0]),\
 array([-150., -145., -140., -135., -130., -125., -120., -115., -110.,\
       -105., -100.,  -95.,  -90.,  -85.,  -80.,  -75.,  -70.,  -65.,\
        -60.,  -55.,  -50.,  -45.,  -40.,  -35.,  -30.,  -25.,  -20.,\
        -15.,  -10.,   -5.,    0.,    5.,   10.,   15.,   20.,   25.,\
         30.,   35.,   40.,   45.,   50.,   55.,   60.,   65.,   70.,\
         75.,   80.,   85.,   90.,   95.,  100.]) )


hist(x,bins=50,range=(-150, 100) ) 
plot((hist_ref[1][1:]+hist_ref[1][:-1])/2.0,hist_ref[0],'r')


from pycuda.elementwise import ElementwiseKernel
SDE2 = ElementwiseKernel(
        "float *x,unsigned int  *rng_state",
        "x[i] = x[i]+0.01*f(x[i]) + sqrtf(0.01*0.1)*rng_uni( &(rng_state[i]) ) ",
        "SDE",
        preamble="""
        __device__ float f(float x)
        { 
          return sin(x)+0.1;
        }
        """+rng_src)