#%install_ext http://bitbucket.org/mforbes/notes_ipython/raw/tip/notebook_utils/mmf_ext.py
#%install_ext /Users/mforbes/current/notes/ipython/notebook_utils/mmf_ext.py
#%reload_ext mmf_ext
%pylab inline
from IPython.display import display
import itertools
import sympy
from sympy import Eq, S
sympy.init_printing(use_latex=True)
import numpy as np
import scipy.special
import scipy.signal
import scipy as sp
from numpy.fft import fftn, ifftn, fftfreq
Populating the interactive namespace from numpy and matplotlib
In summary, we consider two solutions based on the Truncated Kernel method. These are discussed in detail below, but we codify some functions that implement the methods here for testing.
import itertools
import numpy as np
from numpy.fft import fftn, ifftn
def C(k, D=None):
    """Fourier transform of the (truncated) kernel.

    With `D=None` return 1/k^2 (periodic Coulomb kernel, zero at k=0);
    otherwise return the truncated kernel (1 - cos(D*k))/k^2, whose
    k -> 0 limit is D^2/2.
    """
    if D is None:
        numerator, fill = 1.0, 0.0
    else:
        numerator, fill = 1 - np.cos(D*k), D**2/2.0
    # Masked division handles k == 0; `filled` installs the limit there.
    return np.ma.divide(numerator, k**2).filled(fill)
def resample(f, N):
    """Resample `f` onto a grid of size `N` by truncating or zero-padding
    its Fourier transform (spectral interpolation, assumes periodicity)."""
    target = np.array(f.shape)
    target[:] = N                  # N may be a scalar or per-axis sizes
    fk_old = fftn(f)
    fk_new = np.zeros(target, dtype=complex)
    # Copy the low-|k| halves (positive and negative frequencies) along
    # each axis, up to the smaller of the two grids.
    halves = [(slice(0, (m + 1) // 2), slice(-(m - 1) // 2, None))
              for m in np.minimum(f.shape, target)]
    for sel in itertools.product(*halves):
        fk_new[sel] = fk_old[sel]
    # Rescale so that function *values* are preserved under the FFT pair.
    return ifftn(fk_new) * np.prod(target.astype(float)/f.shape)
def get_k(N, L):
    """Return the magnitude |k| of the momenta on the FFT grid.

    (The original docstring claimed the *sum of squares*, but the square
    root is taken before returning.)

    Arguments
    ---------
    N : sequence of int
       Number of lattice points in each dimension.
    L : sequence of float
       Box length in each dimension.
    """
    k2 = np.zeros(N, dtype=float)
    dim = len(N)
    for _i, (n, l) in enumerate(zip(N, L)):
        k = 2*np.pi * np.fft.fftfreq(n, l/n)
        # Broadcast along the _i'th dimension.  Index with a *tuple*:
        # indexing with a list of slices/newaxis is no longer supported
        # by numpy.
        shape = [np.newaxis]*dim
        shape[_i] = slice(None)
        k2 += k[tuple(shape)]**2
    return np.sqrt(k2)
def convolve_coulomb(n, L, linear=False):
    r"""Exact Coulomb potential for charge distribution n.

    Arguments
    ---------
    n : array
       Charge distribution.
    L : array
       Dimensions of the box.
    linear : bool
       If `False`, then use the FFT for a periodic convolution, otherwise
       use a linear convolution by padding and using a truncated kernel
       to remove images:

       .. math::
          (1-\cos\sqrt{3}Lk)/k^2
    """
    N = np.asarray(n.shape)
    L = np.asarray(L)
    if linear:
        D = np.sqrt((L**2).sum())          # diameter of the cell
        # Zero-pad into a box 3x larger in each dimension so images
        # (two or more cells away) cannot contribute inside radius D.
        n_padded = np.zeros(3*N, dtype=n.dtype)
        # Tuple index: lists of slices are no longer valid numpy indices.
        inds = tuple(slice(0, _n) for _n in N)
        n_padded[inds] = n
        return ifftn(C(get_k(3*N, 3*L), D=D) * fftn(n_padded))[inds]
    else:
        return ifftn(C(get_k(N, L)) * fftn(n))
def convolve_coulomb_fast(n, L, corrected=False, reduction=3):
    """Return the Coulomb potential for density `n`.

    The expensive image-free (truncated-kernel) convolution is performed
    on a subgrid smaller by `reduction` in each dimension, then the
    result is resampled back up.  If `corrected`, the high-frequency
    residual is added back via a cheap periodic convolution.
    """
    N = np.asarray(n.shape)
    N0 = N // reduction                 # size of the subgrid
    n0 = resample(n, N0)                # smoothed density on the subgrid
    V = resample(convolve_coulomb(n0, L=L, linear=True), N)
    if corrected:
        # Correct high frequency bits: the residual density has small
        # long-range (image) contributions, so a periodic convolution
        # suffices here.
        V += convolve_coulomb(n - resample(n0, N), L=L)
    return V
def Q(x, y, z):
    """Gaussian dumbbell test density: anisotropic Gaussian, offset along
    y by y.max()/4 and squeezed along z."""
    arg = x**2 + (y - y.max()/4)**2 + 2*z**2
    return np.exp(-arg/2.0)
def get_V(Q, N=32, L=16.0, corrected=False):
    """Tabulate the density `Q` and its Coulomb potential on an N^3 grid.

    Returns `(R, n, V_fast, V_exact)`: radii from the box center, the
    density, the fast subgrid potential, and the exact truncated-kernel
    potential.
    """
    dx = L/N
    x = np.arange(N)*dx - L/2           # grid centered on the box
    X, Y, Z = np.meshgrid(x, x, x)
    R = np.sqrt(X**2 + Y**2 + Z**2)
    n = Q(X, Y, Z)
    # Reference: image-free (padded, truncated-kernel) solution.
    V_exact = convolve_coulomb(n, L=(L,)*3, linear=True)
    V_fast = convolve_coulomb_fast(n, L=(L,)*3,
                                   reduction=3, corrected=corrected)
    return R, n, V_fast, V_exact
%pylab inline
R, n, V_fast, V_exact = get_V(Q, 32, corrected=True)
plot(R.ravel(), n.ravel(), '.b', label='Density')
plot(R.ravel(), V_exact.ravel(), '+g', label='Potential')
plot(R.ravel(), V_fast.ravel(), '.r', label='Fast')
xlabel('r'); legend()
Populating the interactive namespace from numpy and matplotlib
/data/apps/anaconda/1.3.1/lib/python2.7/site-packages/numpy/core/numeric.py:460: ComplexWarning: Casting complex values to real discards the imaginary part return array(a, dtype, copy=False, order=order)
<matplotlib.legend.Legend at 0x10dab9d90>
figure(figsize=(20,10))
def plot_err(N, Q=Q, **kw):
    """Semilog plot of |V_fast - V_exact| vs radius for an N^3 grid.

    Uses the %pylab `semilogy`; extra keywords are passed to `get_V`.
    """
    R, n, V_fast, V_exact = get_V(Q, N, **kw)
    semilogy(R.ravel(), abs((V_fast - V_exact)).ravel(), '.',
             label="N=%i" % (N,))
subplot(121)
plot_err(16)
plot_err(32)
plot_err(64)
axis_box = axis()
title('Uncorrected'); ylabel('err in V'); xlabel('r'); legend()
subplot(122)
plot_err(16, corrected=True)
plot_err(32, corrected=True)
plot_err(64, corrected=True)
axis(axis_box)
title('Corrected'); ylabel('err in V'); xlabel('r'); legend()
<matplotlib.legend.Legend at 0x102885f50>
These plots show the deviation from the full expensive truncated calculation. In principle, the corrected version should be better, but with the exception of a few points, there seems to be not much point in doing the extra correction – most likely due to inaccuracies introduced by the interpolation.
We now show the performance gains:
import time
L = 16.0
Ns = np.array([8, 16, 32, 64, 128])
# Wall-clock timings: exact (padded) solve, bare FFT round-trip as a
# baseline, and the fast subgrid method with/without the correction.
ts_fft = []
ts_exact = []
ts_fast = []
ts_fast_corrected = []
for N in Ns:
    dx = L/N
    x = np.arange(N)*dx - L/2
    X, Y, Z = np.meshgrid(x, x, x)
    n = Q(X, Y, Z)
    tic = time.time()
    V_exact = convolve_coulomb(n, L=(L,)*3, linear=True)
    ts_exact.append(time.time() - tic)
    tic = time.time()
    ifftn(fftn(n))                      # baseline: one FFT round-trip
    ts_fft.append(time.time() - tic)
    tic = time.time()
    V_fast = convolve_coulomb_fast(n, L=(L,)*3,
                                   reduction=3, corrected=False)
    ts_fast.append(time.time() - tic)
    tic = time.time()
    V_fast = convolve_coulomb_fast(n, L=(L,)*3,
                                   reduction=3, corrected=True)
    ts_fast_corrected.append(time.time() - tic)
ts_fft = np.asarray(ts_fft)
plot(Ns**3*log10(Ns**3), ts_fft)
xlabel(r'$N^3 \log_{10}(N^3)$');ylabel('t (s)')
<matplotlib.text.Text at 0x10e6a7390>
semilogy(Ns, ts_fft/ts_fft, label='fft')
semilogy(Ns, ts_exact/ts_fft, '-+', label='exact')
semilogy(Ns, ts_fast/ts_fft, '-+', label='fast')
semilogy(Ns, ts_fast_corrected/ts_fft, '-+', label='corrected')
yticks([1,2,10,20,30,40,50], [1,2,10,20,30,40,50])
xlabel('N');ylabel('time (units of bare ifft(fft())'); legend(loc='best')
<matplotlib.legend.Legend at 0x10e6cc290>
This is the code model used in the code. To this, we add a Coulomb energy:
$$ E_C = \frac{q^2}{4\pi \epsilon_0}\int\d^3{\vect{x}}\d^3{\vect{y}} \frac{n(\vect{x})n(\vect{y})}{\abs{\vect{x}-\vect{y}}}. $$where each particle has charge $q$. The resulting piece in the Hamiltonian will have the form:
$$ \I\dot{\psi}(\vect{x}) = V[n](\vect{x})\psi(x), \qquad V[n](\vect{x}) = -\frac{q^2}{4\pi \epsilon_0} \int\d^3{\vect{y}}\frac{n(\vect{y})}{\abs{\vect{x}-\vect{y}}}. $$Naïvly this is easy. The form is simply a convolution and the Fourier Transform of the Coulomb potential is well known as a limiting case $\kappa \rightarrow 0$ of the Yukawa potential:
$$ V_{\text{Yukawa}}(r) = \frac{-g^2 e^{-\kappa r}}{r} = \int\frac{\d^3{\vect{k}}}{(2\pi)^3}\; e^{\I\vect{k}\cdot\vect{r}} \tilde{V}_{\text{Yukawa}}(\vect{k}),\\ \tilde{V}_{\text{Yukawa}}(\vect{k}) = \frac{-g^2 4\pi}{k^2 + \kappa^2}. $$Simply performing the convolution, however, will contaminate the results since the neighbouring cells have images from the periodic basis. Normally this is not a problem, but since the Coulomb potential is a long-range interaction, we need to be careful to avoid contamination. We want only the Couloumb potential from the charges in the principle cell. The terminology is "solving the Poisson equation with free boundary conditions" or "cluster boundary conditions" rather than with "periodic boundary conditions".
Another issue is that at $k=0$ the Fourier transform is singular in the limit $\kappa \rightarrow 0$. This is simply dealt with by setting $\tilde{V}(\vect{0}) = 0$ which corresponds to subtracting a uniform charge density to make the system neutral. For a purely periodic system, this is the best we can do because the system is not extensive and any net charge will give a physically diverging potential.
Solutions are discussed in Castro:2000 but a particularly simple solution is to define a modified "Coulomb" potential that is zero outside of a radius $D$:
$$ V_D(r) = \begin{cases} \tfrac{1}{r} & \text{r < D}\\ 0 & \text{otherwise} \end{cases} = \int\frac{\d^3{\vect{k}}}{(2\pi)^3}\; e^{\I\vect{k}\cdot\vect{r}} \frac{4\pi(1-\cos kD)}{k^2}. $$If we choose $D = \sqrt{3}L$, then we will obtain all contributions from the current cell, but no contributions from cells two-units away. Thus, by imbedding our cell in a super-cell 3 times larger, we can eliminate any contributions from outside of the main cell.
Simply implemented, this would require applying the FFT to a $(3N)^3$ cell instead (padding the function with zeros), but one can improve this by instead computing 27 FFT's in the original cell of the function.
(See http://www.if.pw.edu.pl/~magiersk/prezentacje/tddft_ect.pdf for details).
An alternate approach is considered in Genovese:2006. Here the idea is to represent the density in terms of a set of interpolating functions:
$$ n(\vect{x}) = \sum_{ijk} n_{ijk}\phi_i(x)\phi_j(y)\phi_k(z) $$where $\phi_i(x) = \phi(x - x_i)$ is an interpolating function.
The Coulomb potential is thus:
$$ V(\vect{X}) = -\frac{q^2}{4\pi \epsilon_0} \sum_{ijk} \int\d^3{\vect{x}'}\frac{n_{ijk}\phi_i(x')\phi_j(y')\phi_k(z')}{\abs{\vect{X}-\vect{x}'}} = -\frac{q^2}{4\pi \epsilon_0}\sum_{ijk} n_{ijk} K(\vect{X} - \vect{x}_{ijk}) $$which is a convolution that can be computed with FFT using the following kernel
$$ K(\vect{X} - \vect{x}_{ijk}) = \int\d^3{\vect{x}'}\frac{\phi(x'-x_i)\phi(y'-y_j)\phi(z'-z_k)}{\abs{\vect{X}-\vect{x}'}},\\ K(\vect{X}) = \int\d^3{\vect{x}'}\frac{\phi(x')\phi(y')\phi(z')}{\abs{\vect{X}-\vect{x}'}},\\ $$If one uses periodic basis functions here, then this reduces to the standard periodic computation, but if one uses localized basis functions without periodic images, then the computation is for free boundary conditions. A technique for efficiently evaluating the kernel is discussed in Genovese:2006.
Unfortunately, the kernel needs to be evaluated over a box of size $(2L)^3$. While this can be implemented with 8 FFTs, it is still not an optimal solution.
Another related approach is described in Lee:2008b which solves the Coulomb problem in the Sinc-function DVR basis. The evaluation of the integrals, however, requires some complicated numerical work and we have not explored this yet.
A hybrid solution is to expand the charge distribution in the principle cell in terms of a multipole expansion. Then, a few low-order terms of this expansion can be used to compute the Coulomb potential with free boundary conditions, and this low-order distribution can be subtracted, with the residual calculated using the FFT. The idea here is that higher multipoles fall off with higher powers of distance, so the problem due to images is only significant for the lowest multipoles which we compute analytically.
The question here is if the processing of the multipoles will be competitive performance-wise with the other methods.
The Coulomb potential has the following Multipole expansion:
$$ V(\vect{y}) = \frac{1}{4\pi \epsilon_0}\sum_{l=0}^{\infty}\sum_{m=-l}^{l} \frac{4\pi}{2l+1}q_{lm}\frac{Y_{lm}(\Omega)}{r^{l+1}},\\ q_{lm} = \int Y^*_{lm}(\Omega)r^l n(\vect{x})\d^3{\vect{x}}. $$Since our points are tabulated on a grid, we use the cartesian form up to thq quadrupole moment:
$$ V(\vect{y}) = \frac{1}{4\pi \epsilon_0}\left[ \frac{q}{r} + \frac{\vect{p}\cdot\vect{y}}{r^3} + \frac{1}{2}\frac{\vect{y}\cdot\mat{Q}\cdot\vect{y}}{r^5} + \cdots \right],\\ Q(\vect{y}) = \begin{aligned} q &= \int n(\vect{x})\d^3{\vect{x}}, & \vect{p} &= \int n(\vect{x})\vect{x}\d^3{\vect{x}}, & [\mat{Q}]_{ij} &= \int (3x_i x_j - r^2\delta_{ij}) n(\vect{x})\d^3{\vect{x}}, \end{aligned} $$Here we propose a combination of the Truncated Kernel method and the Multipole Expansion method. The idea is to use the Truncated Kernel method on a subgrid small enough to fit into memory, and use this to correct the images.
Here is a simple example with a top-hat density distribution taking $q=\epsilon_0=1$:
$$ n(r) = \begin{cases} 1 & r \leq R\\ 0 & r > R \end{cases}, \qquad V(r) = \begin{cases} \frac{3R^2-r^2}{6} & r \leq R\\ \frac{R^3}{3 r} & r > R \end{cases} $$Here is another example with a Gaussian charge distribution:
\begin{align} n(r) &= e^{-r^2/2a^2}\\ V(r) &= \int\d^3{y} \frac{n(\abs{r - y})}{4\pi y}\\ &= 2\pi\int_0^{\infty}y^2\d{y}\int_{-1}^{1}\d\chi\; \frac{n(\sqrt{y^2 - 2yr\chi + r^2})}{4\pi y}\\ &= \int_0^{\infty}y^2\d{y} \int_{-1}^{1}\d\chi\;e^{yr\chi/a^2} \frac{e^{-(y^2 + r^2)/2a^2}}{2y}\\ &= \int_0^{\infty}y^2\d{y} \frac{2 a^2 \sinh\frac{yr}{a^2}}{yr} \frac{e^{-(y^2 + r^2)/2a^2}}{2y}\\ &= \frac{a^2}{r}e^{-r^2/2a^2} \int_0^{\infty}\d{y} \sinh\frac{ry}{a^2}e^{-y^2/2a^2}\\ &= \frac{a^3}{r} \sqrt{\frac{\pi}{2}}\erf\frac{r}{a\sqrt{2}} \end{align}a = 1.0
L = 16.0 # Ensures that n(L/2) < 1e-12
L = 20.0 # Ensures that n(L/2) < 1e-12
def n(r, a=a):
    """Gaussian charge distribution of width `a` (default captures the
    module-level `a` at definition time)."""
    scaled = (r/a)**2
    return np.exp(-scaled/2)
Q = (a*np.sqrt(2*np.pi))**3 # Total charge
def V_n(r, a=a):
    """Exact Coulomb potential corresponding to the Gaussian density n(r)."""
    r = r + 1e-12                       # regularize the r = 0 point
    prefactor = np.sqrt(np.pi/2.0)
    return prefactor * a**3/r * sp.special.erf(r/a/np.sqrt(2))
def V_monopole(r, a=a):
    """Coulomb potential of a point charge at the origin carrying the
    total charge Q of the Gaussian distribution."""
    Q = (a*np.sqrt(2*np.pi))**3         # total charge of the Gaussian
    r = r + 1e-12                       # regularize the r = 0 point
    return Q/4/np.pi/r
def Vk(k):
    r"""Fourier transform of the Coulomb potential $1/4\pi r$: 1/k^2,
    with the k = 0 (uniform neutralizing background) component set to 0."""
    k = np.asarray(k, dtype=float)
    # np.where evaluates both branches, so the original emitted a
    # divide-by-zero RuntimeWarning at k == 0; silence it explicitly.
    with np.errstate(divide='ignore'):
        return np.where(k == 0, 0.0, np.divide(1.0, k**2))
r = np.linspace(0,L,100)
plot(r, V_n(r), label='exact')
plot(r, V_monopole(r), 'r', label='monopole')
ylim([0,1]); xlabel('r'); ylabel('V(r)');axvline(L/2.0, c='lightgrey');legend()
twinx()
semilogy(r, abs(V_n(r) - V_monopole(r)), 'g--')
ylabel('Err')
<matplotlib.text.Text at 0x10e8ce1d0>
Since the charge distribution virtually vanishes at the boundaries of the box and we have a spherical distribution, the monopole approximation is essentially exact outside of the cell.
# Lattice of N points spanning a box of side L, plus the FFT momenta.
N = 32
dx = L/N
x = np.arange(N) * (L/N) - L/2
k = 2*np.pi * fftfreq(N, L/N)
# Index helpers that broadcast a 1D array along each of the three axes.
ix, iy, iz = (slice(None), None, None), (None, slice(None), None), (None, None, slice(None))
(X, Kx), (Y, Ky), (Z, Kz) = [(x[_i], k[_i]) for _i in [ix, iy, iz]]
R = np.sqrt(X**2 + Y**2 + Z**2)
K = np.sqrt(Kx**2 + Ky**2 + Kz**2)
V_exact = V_n(R)
n_R = n(R)
# Periodic convolution: this potential includes image contributions.
V_R = ifftn(Vk(K)*fftn(n_R))
assert np.allclose(Q, n_R.sum() * dx**3)    # sanity check: total charge
-c:24: RuntimeWarning: divide by zero encountered in divide
# Ordering that sorts the raveled radii; used to plot fields vs r.
iRs = np.argsort(R.ravel())
def flat(R, iRs=iRs):
    """Ravel `R` and sort by increasing radius (the default ordering
    `iRs` is captured from the module level at definition time)."""
    return R.ravel()[iRs]
plot(flat(R), flat(n_R), 'y-', label="Charge")
plot(flat(R), flat(V_R), 'r-', label="Periodic")
plot(flat(R), flat(V_exact), 'g:', label="Exact")
plot(flat(R), flat(V_exact - V_R), 'b--', label="Images")
axvline(L/2.0, c='lightgrey')
plt.xlabel('r')
plt.legend()
<matplotlib.legend.Legend at 0x10f198bd0>
We implement the simplest form of the Truncated Kernel method here by simply padding the function and computing the full FFT in the larger space. This is not appropriate for large problems as the padded array will take $3^d$ times more space but allows the FFTW (if used) to take advantage of multiple cores. One can exactly compute the same result with $3^d$ separate FFTs which makes much better use of memory – this is discussed below. Note: one might be able to make use of the results of Roberts:2011 and Bowman:2011 to improve the speed of this convolution, but those algorithms are structured for only a simple padding of twice the dimension. (This would work with the box truncated kernel, but no analytic form for the Fourier transform of this is known.)
def bcast(n, N):
    """Return an index that broadcasts a 1D array along the n'th of N
    dimensions.

    Returns a tuple rather than a list: modern numpy requires tuple
    indices (indexing with a list of slices/None was deprecated and then
    removed).  Tuples also work on all older versions.
    """
    inds = [None]*N
    inds[n] = slice(None)
    return tuple(inds)
def coulomb3(rho, L):
"""Compute the Coulomb potential using the Truncated Kernel
method by padding into the (3N)^3 space."""
L = np.asarray(L, dtype=double) #length of the cell
N = np.asarray(rho.shape) #number of lattice
D = np.sqrt((L**2).sum()) #diameter
K = [2*np.pi * fft.fftfreq(3*_N, d=_L/_N)[bcast(_n, len(N))] #momtentum spectrum
for _n, (_N, _L) in enumerate(zip(N, L))]
Rho = np.zeros(3*N, dtype=rho.dtype)
inds = [slice(0,_N) for _N in N]
Rho[inds] = rho
k = np.sqrt(sum(_K**2 for _K in K))
i0 = np.where(k == 0)
C_k = (1 - np.cos(D*k))/k**2
C_k[i0] = 0.5*D**2 #L'hospital rule for C when k=0
V = fft.ifftn(C_k * fft.fftn(Rho))[inds]
return V
V_tk = coulomb3(n_R, (L,)*3)
semilogy(flat(R), flat(abs(V_R-V_exact)), 'y-', label="Periodic")
semilogy(flat(R), flat(abs(V_tk - V_exact)), 'r--', label="Truncated Kernel")
xlabel('r'); ylabel('V err'); legend()
-c:21: RuntimeWarning: invalid value encountered in divide
<matplotlib.legend.Legend at 0x10f1cff10>
The errors here are due to the discretization: use a finer lattice and they will get smaller.
Here we do a simple monopole correction. We use the FFT to compute the potential due to point charges at the center of all cells and then subtract this from the periodic solution, adding back the simple term by hand. To regularize this, we use a Gaussian charge distribution, but with a different radius. Of course, if $b=a$ then this will be exact.
As a diagnostic tool we explicitly compute the potential from the nearest images.
import itertools
def V_images(level=1, V=V_n):
    """Potential from all periodic images within `level` cells of the
    origin, excluding the central cell.

    NOTE(review): this generator-expression version is immediately
    shadowed by the loop-based redefinition that follows; it is dead code
    kept for reference.
    """
    return sum(V(np.sqrt((X-L*nx)**2 + (Y-L*ny)**2 + (Z-L*nz)**2))
               for nx, ny, nz in itertools.product(np.arange(-level, level+1), repeat=3)
               if not(nx == 0 and ny == 0 and nz == 0))
def V_images(level=1, V=V_n):
    """Sum the potential `V` over all periodic image cells within `level`
    of the origin; the central cell itself is excluded."""
    total = 0*R
    shifts = np.arange(-level, level+1)
    for mx, my, mz in itertools.product(shifts, repeat=3):
        if mx == 0 and my == 0 and mz == 0:
            continue                    # skip the principal cell
        image_r = np.sqrt((X - L*mx)**2 + (Y - L*my)**2 + (Z - L*mz)**2)
        total += V(image_r)
    return total
plot(flat(R), flat(V_images(level=1)), 'g-', label="Images")
xlabel('r');ylabel('V(r)');legend()
<matplotlib.legend.Legend at 0x10f1b0f50>
This confirms that the contribution from the images is a constant in the unit cell to within a few percent.
# Regularized monopole: a narrower Gaussian (width b < a) carrying the
# same total charge Q, used to model/subtract the image contributions.
b = 0.5
n_monopole_R = n(R, a=b)
fract = Q/(n_monopole_R.sum() * dx**3)      # renormalize to total charge Q
n_monopole_R *= fract
assert np.allclose((n_R - n_monopole_R).sum(), 0)   # residual is neutral
# Periodic potential of the monopole density minus its exact free-space
# potential: approximately the potential due to the images alone.
V_monopole_R = ifftn(Vk(K)*fftn(n_monopole_R)) - fract*V_n(R, a=b)
semilogy(flat(R), flat(abs(V_R - V_exact)), 'y-', label="Periodic")
semilogy(flat(R), flat(abs(V_R - V_monopole_R - V_exact)), 'r--', label="Corrected")
xlabel('r'); ylabel('V err'); legend()
<matplotlib.legend.Legend at 0x10f459f50>
I do not understand the source of the error here. It is roughly constant (independent of $r$) so I thought it might come from an overall offset (i.e. perhaps the total charge is not computed accurately due to the discretization), but it turns out to be insensitive to the lattice spacing. Since it has no $r$ dependence I thought that it cannot be from multipoles... but what is it?
Now we try the same expansion, but with an offset charge distribution. This will include a dipole term, hence the monopole subtraction will not be exact. (Note: we keep the charge the same so we can use the same monopole correction.)
# Pair of Gaussians offset along +/- z: introduces a dipole moment while
# keeping the total charge Q unchanged (so the monopole correction above
# still applies).
offset = 0.1*L
Ra = np.sqrt(X**2 + Y**2 + (Z+offset)**2)
Rb = np.sqrt(X**2 + Y**2 + (Z-offset)**2)
n2_R = (n(Ra) + n(Rb))/2; assert np.allclose(Q, (n2_R.sum() * dx**3))
V2_exact = (V_n(Ra) + V_n(Rb))/2
V2_R = ifftn(Vk(K)*fftn(n2_R))
semilogy(flat(R), flat(abs(V2_R - V_exact)), 'y-', label="Periodic")
semilogy(flat(R), flat(abs(V2_R - V_monopole_R - V_exact)), 'r-', label="Monopole Corrected")
xlabel('r');ylabel('V error');legend()
<matplotlib.legend.Legend at 0x10f4c2910>
Finally, we use a symmetric configurations of Gaussians to kill the dipole terms to check that we obtain better convergence.
# Six Gaussians displaced by +/- offset along each axis: the dipole
# moments cancel by symmetry, leaving only higher multipoles.
n8_R = 0*R
V8_exact = 0*R
_n = 0
for _o in [-offset, offset]:
    for _R2 in [(X+_o)**2 + Y**2 + Z**2,
                X**2 + (Y+_o)**2 + Z**2,
                X**2 + Y**2 + (Z+_o)**2]:
        _R = np.sqrt(_R2)
        n8_R += n(_R)
        V8_exact += V_n(_R)
        _n += 1
n8_R /= _n; assert np.allclose(Q, (n8_R.sum() * dx**3))  # charge preserved
V8_exact /= _n
V8_R = ifftn(Vk(K)*fftn(n8_R))
semilogy(flat(R), flat(abs(V8_R - V_exact)), 'y-', label="Periodic")
semilogy(flat(R), flat(abs(V8_R - V_monopole_R - V_exact)), 'r-', label="Monopole Corrected")
xlabel('r');ylabel('V error');legend()
<matplotlib.legend.Legend at 0x117119e50>
The predominant error here is from the missing dipole correction from the images. We still have the same constant offset error that I do not understand.
The idea here is similar to the Monopole Expansion correction, but we do this using the Truncated Kernel method on a subgrid. This should be similar to including a very high order multipole expansion. First we need a method for resampling a periodic function. This is based on scipy.signal.resample
but omits windowing support and generalizes to the nD case.
Summary: This method works quite well except for one difficulty - resampling the potential from the subgrid to the full grid. The resample
method treats the function as if it were periodic, which of course it is not (the whole point of this). One can do a little better with a cubic spline interpolation, but this is still not optimal.
def resample(f, N):
    """Resample `f` onto a grid of size `N` by truncating or zero-padding
    its Fourier transform (spectral interpolation, assumes periodicity)."""
    target = np.array(f.shape)
    target[:] = N                  # N may be a scalar or per-axis sizes
    fk_old = np.fft.fftn(f)
    fk_new = np.zeros(target, dtype=complex)
    # Copy the low-|k| halves (positive and negative frequencies) along
    # each axis, up to the smaller of the two grids.
    halves = [(slice(0, (m + 1) // 2), slice(-(m - 1) // 2, None))
              for m in np.minimum(f.shape, target)]
    for sel in itertools.product(*halves):
        fk_new[sel] = fk_old[sel]
    # Rescale so that function *values* are preserved under the FFT pair.
    return np.fft.ifftn(fk_new) * np.prod(target.astype(float)/f.shape)
# Use the dipole or octopole problem
n_R, V_R, V_exact = n2_R, V2_R, V2_exact
#n_R, V_exact = n8_R, V8_R, V8_exact
Ls = (L,)*3
N0 = N//2                       # Size of subgrid
n_Rtk0 = resample(n_R, N0)      # Smoothed density on subgrid
n_Rtk = resample(n_Rtk0, N)     # Smooth density resampled up to large grid
semilogy(flat(R), flat(abs(n_Rtk - n_R)), '.')
xlabel('r'); ylabel('density difference');title('Smoothed density residual')
<matplotlib.text.Text at 0x113f47910>
We now compute the Coulomb potential (without images) for this smoothed density. The errors at this point are due to the interpolation.
# Two corrections: both remove the downsampled density from the periodic
# convolution; they differ in where the truncated-kernel solve happens.
V_tk = resample(coulomb3(n_Rtk0, Ls), N) # periodic interpolation of a non-periodic function
V_tk1 = coulomb3(n_Rtk, Ls) # Too expensive, but this is what we want.
V_corr = ifftn(Vk(K)*fftn(n_R - n_Rtk)) + V_tk
V_corr1 = ifftn(Vk(K)*fftn(n_R - n_Rtk))+ V_tk1
#assert np.allclose(V_tk, coulomb3(resample(n_Rtk, N), (L,)*3))
semilogy(flat(R), flat(abs(V_R - V_exact)), 'y-', label="Periodic")
semilogy(flat(R), flat(abs(V_tk - V_exact)), 'b:', label="Truncated Kernel")
semilogy(flat(R), flat(abs(V_corr - V_exact)), 'r:', label="Corrected")
semilogy(flat(R), flat(abs(V_corr1 - V_exact)), 'g:', label="Desired Result")
xlabel('r'); ylabel('V err'); legend()
<matplotlib.legend.Legend at 0x1168805d0>
Here we show two corrections. Both corrections have the downsampled density removed from the periodic computation which should remove the majority of the images, but the poorly corrected version has the non-periodic downsampled potential upsampled using the periodic resample
method.
Note: I have resampled at a sublattice with half as many points for demonstration purposes as this sublattice will exactly contain point of the original lattice. In production code one would probably use N//3
so that the size of the padded array is (roughly) the same as the original.
#semilogy(flat(R), flat(abs(coulomb3(resample(n_Rtk, N), Ls) - V_tk)))
print x
print scipy.signal.resample(rand(N), N//2, t=x)[1]
def upsample1(f):
    """Linearly upsample `f` onto a grid twice as fine: original samples
    at even indices, midpoints at odd indices, with the final point
    linearly extrapolated."""
    out = np.zeros(2*np.asarray(f.shape), dtype=f.dtype)
    out[::2] = f
    midpoints = (f[:-1] + f[1:])/2.0
    out[1:-1:2] = midpoints
    out[-1] = (3*f[-1] - f[-2])/2       # extrapolate past the last sample
    return out
def upsampleDVR(f, _basis=scipy.sinc(N//2 * (x[:,None] - x[::2][None, :])/L)):
    """Upsample with `_basis` (Sinc DVR by default).

    NOTE(review): the default `_basis` is evaluated once at definition
    time from the module-level `N`, `x`, and `L`; later changes to those
    globals do not affect it.
    """
    return np.dot(_basis, f)
[-10. -9.375 -8.75 -8.125 -7.5 -6.875 -6.25 -5.625 -5. -4.375 -3.75 -3.125 -2.5 -1.875 -1.25 -0.625 0. 0.625 1.25 1.875 2.5 3.125 3.75 4.375 5. 5.625 6.25 6.875 7.5 8.125 8.75 9.375] [-10. -8.75 -7.5 -6.25 -5. -3.75 -2.5 -1.25 0. 1.25 2.5 3.75 5. 6.25 7.5 8.75]
_x = X.ravel()
_y_exact = V_exact[16,16,:]
# Here are the results on the sublattice before resampling
_x0 = _x[::2]
_y0 = coulomb3(n_Rtk0, Ls)[16//2,16//2,:]
# And the previous results
_y1 = V_tk1[16,16,:]
semilogy(_x, abs(_y_exact - resample(_y1, N)), 'b:+', label='Periodic')
semilogy(_x, abs(_y_exact - upsampleDVR(_y0)), 'g:+', label='DVR')
semilogy(_x, abs(_y_exact - upsample1(_y0)), ':+', label='linear')
from scipy.interpolate import interp1d
semilogy(_x, abs(_y_exact - interp1d(_x0, _y0, bounds_error=False, kind=3)(_x)), 'r-',
label='Cubic')
legend()
<matplotlib.legend.Legend at 0x1169e86d0>
/data/apps/anaconda/1.3.1/lib/python2.7/site-packages/matplotlib/scale.py:90: RuntimeWarning: invalid value encountered in less_equal mask = a <= 0.0
Here we simply demonstrate that one can use the FFT to interpolate/extrapolate between lattices by truncating or padding the transform. This is done by the function scipy.signal.resample
. We code a version here for fast ND resampling.
N0 = 16
N1 = 31
L = 1.0
dx0 = L/N0
x0 = np.arange(N0)*dx0 - L/2
dx1 = L/N1
x1 = np.arange(N1)*dx1 - L/2
f0 = np.exp(np.sin(2*np.pi*x0))
f1 = np.exp(np.sin(2*np.pi*x1))
plot(x0, f0)
plot(x1, f1)
# Interpolate to a finer grid
plot(x1, resample(f0, N1))
plot(x0, resample(f1, N0), '.')
[<matplotlib.lines.Line2D at 0x116c04590>]
semilogy(x1, abs(resample(resample(f1, N0), N1) - f1))
[<matplotlib.lines.Line2D at 0x1169dc650>]
Here we setup a simple problem in one dimension. We shall use a Gaussian charge distribution $n(x) = \exp(-x^2/2)$ with a kernel $K(x) = 1/(1+x^2)$:
\begin{align} n(x) &= e^{-\frac{x^2}{2a^2}}, & K(x) &= 1/(1+x^2), & \tilde{K}(k) &= \pi e^{-\abs{k}}, & V(y) &= \int\d{x}\;n(x)K(y-x) = \int\d{x}\;\frac{e^{-\frac{x^2}{2}}}{1+(y-x)^2}. \end{align}These will be evaluated on a regular lattice $x_n = an$ with interpolating functions
$$ \phi(x) = \sinc(\pi x/a). $$The Fourier coefficients for $\tilde{K}$ are
$$ \tilde{K}_{k} = \pi a e^{-\abs{k}}. $$def K(x):
return 1/(1+x**2)
def n(x):
    """Unit-width Gaussian test density exp(-x^2/2)."""
    half_square = x**2/2
    return np.exp(-half_square)
N = 1024
L = 20.0
a = L/N                          # lattice spacing
x = np.arange(N) * a - L/2.0
k = 2*np.pi*np.fft.fftfreq(N, d=a)
# NOTE(review): this binds the numpy *module*, not a potential -- looks
# like an incomplete leftover; it is not used below.
V_exact = np
X = x[:,None]
# Direct (dense-matrix) convolution with the sampled kernel K.
KX = a/(1+(x[:,None]-x[None,:])**2)
V = dot(KX, n(x))
Kk = np.pi*exp(-abs(k))          # analytic FT of K (no lattice factor a)
KXk = np.pi*a*np.exp(-abs(k))    # analytic Fourier coefficients (with a)
V1 = np.fft.ifft(np.fft.fft(n(x))*KXk)
# Periodic convolution using the FFT of the sampled kernel.
Vp = np.fft.ifft(np.fft.fft(n(x))*np.fft.fft(K(x)))
# NOTE(review): the scale factors (V*20, V1.real*N) look like manual
# normalization for visual comparison only -- confirm.
plot(x, V*20, x, V1.real*N, x, Vp)
[<matplotlib.lines.Line2D at 0x116e80ad0>, <matplotlib.lines.Line2D at 0x116e80d50>, <matplotlib.lines.Line2D at 0x116e8f790>]
Consider 1D. We need to convolve the kernel $C(x)$ with a function $n(x)$ that has compact support:
$$ V(x) = \int \d{y}\; C(y-x)n(y) = \int\frac{\d{k}}{2\pi} e^{\I k x}\tilde{C}(k)\tilde{n}(k) $$In a periodic box of length $3L$ this becomes
$$ V(x_m) = \frac{1}{3N}\sum_{n=0}^{3N-1} e^{\I K_n x_m}\tilde{C}_n\tilde{n}(K_n)\\ K_n = \frac{2\pi n}{3L},\\ \tilde{n}(K_n) = \sum_{m} e^{-\I K_n x_m}n(x_m). $$where $\tilde{C}_n$ are known analytically for the Kernel specified above.
Instead of performing the FFT in a $3L$ box, one can instead perform several FFTs in the original $L$ box. To see this, partition the momenta indices as $n \rightarrow 3n + l$ so we can isolate the momenta components $k_n$ that will arise from a Fourier transform in the original periodic box $K_{3n+l} = k_n + \delta_l$ where $k_n = 2\pi n/L$ and $\delta_l = 2\pi l/3L$:
$$ V(x_m) = \frac{1}{3N}\sum_{l=0}^{2} \sum_{n=0}^{N-1} e^{\I (k_n + \delta_l) x_m}\tilde{C}_{3n+l}\tilde{n}(k_n + \delta_l) = \frac{1}{3N}\sum_{l=0}^{2}e^{\I \delta_l x_m} \sum_{n=0}^{N-1} e^{\I k_n x_m}\tilde{C}_{3n+l}\tilde{n}(k_n + \delta_l) \\ \tilde{n}(k_n + \delta_l) = \sum_{m} e^{-\I k_n x_m}e^{-\I \delta_l x_m}n(x_m). $$The simplification is that, in the last line, the Fourier components of $n(x)$ in the larger box can be computed in terms of the Fourier transform of the function $\exp(-\I\delta_l x)n(x)$ in the small box. Thus, instead of a $(3L)^3$ FFT which would cost $O[(3N)^3\log (3N)^3] = O[81 N^3\log(3N)]$, we have 27 $(L)^3$ FFTs which would cost $O[27 N^3\log N^3] = O[81 N^3\log N]$.
After the fact, this does NOT look like a significant performance boost. We need to profile.
In general, this is all one can hope for, but in this particular case
Other approaches might use the fact that the Coulomb potential satisfies the Poisson equation:
$$ \nabla^2 V(\vect{r}) = \frac{-q^2}{\epsilon_0} n(\vect{r}), \qquad V(\infty) = 0. $$Multigrid methods are extremely efficient here, but the implementation of the boundary conditions is quite challenging.
#%%script maple
#assume(a>0);
#int(sinh(r*y/a^2)*exp(-y^2/2/a^2), y=0..infinity);
%pylab inline
Populating the interactive namespace from numpy and matplotlib
Here is a brief demonstration of how to write a function of long period $C(x+3L)=C(x)$ in terms of periodic functions $C_l(x+L) = C_l(x)$:
$$ C(x) = \sum_{l=0}^{2} e^{\I\delta_l x}C_l(x)\\ C_l(x) = \frac{1}{3}\sum_{n=0}^{2} e^{-\I\delta_l (x + nL)}C(x + nL) $$L = 1.0
def C(x):
    """Test function with period 3L for the decomposition demo."""
    x = x % (3*L)
    return exp(x)-10
def delta(l):
    """Sublattice momentum shift delta_l = 2*pi*l/(3L)."""
    return 2.0*np.pi*l/3.0/L
def Cl(x, l):
    """Period-L components of C: C_l(x) = (1/3) sum_n
    exp(-i*delta_l*(x+nL)) C(x+nL), per the formula above."""
    return 1./3 * exp(-1j*delta(l)*x)*sum(exp(-1j*delta(l)*n*L) * C(x+n*L) for n in xrange(3))
x = linspace(0,10,100)
plot(x, C(x), 'b')
Cls = array([Cl(x,_l) for _l in xrange(3)]).T
plot(x, Cls.real, 'y', x, Cls.imag, 'y:')
# The reconstruction error (red curve) should vanish.
plot(x, abs(C(x) - sum(exp(1j*delta(_l)*x)*Cl(x,_l) for _l in xrange(3))), 'r')
[<matplotlib.lines.Line2D at 0x116a7b150>]
def C(x):
    """Test function with period 3 (L = 1 here)."""
    x = x % 3
    return exp(x)-10
def Cd(x, d):
    """Period-1 components of C.

    NOTE(review): the 1/3 normalization is applied in the reconstruction
    below rather than here, and the phase sign is opposite to the
    previous cell -- confirm this is the intended convention.
    """
    return sum(exp(2j*pi/3.0*d*(x+_n)) * C(x+_n) for _n in xrange(3))
x = linspace(0,10,100)
plot(x, C(x), 'b')
# The reconstruction error (red curve) should vanish.
plot(x, abs(C(x) - sum(exp(-2j*pi/3.0*_d*x)*Cd(x,_d)/3 for _d in xrange(3))), 'r')
[<matplotlib.lines.Line2D at 0x10e43ed50>]
def C(x):
    """Test function with period 3 (L = 1 here)."""
    x = x % 3
    return exp(x)-10
def Cl(x, l):
    """Period-1 components of C.

    NOTE(review): `delta_l = 2*np.pi*l` lacks the /3 factor used in the
    previous cells, and the phase multiplies by `_n*(x+_n)` rather than
    `(x+_n)` -- this looks inconsistent with the decomposition formula
    above; confirm whether the reconstruction error below vanishes.
    """
    delta_l = 2*np.pi*l
    return 1.0/3*sum(exp(1j*delta_l*_n*(x+_n)) * C(x+_n) for _n in xrange(3))
x = linspace(0,10,100)
plot(x, C(x), 'b')
plot(x, abs(C(x) - sum(exp(-2j*pi/3.0*_l*x)*Cl(x,_l) for _l in xrange(3))), 'r')
[<matplotlib.lines.Line2D at 0x1175d5f90>]
Here we try three different ways of doing this:
coulomb1
: Uses the $(3L)^3$ box directly.coulomb2
: Uses a loop. This should be the most memory efficient of all the algorithms.coulomb3
: Organizes the loop as extra dimensions in a 6 dimensional array. This does not save on memory, but allows the FFTW to use multiple cores.To keep this all straight note that in 3D we label indices as $x_{abc}$ for coordinates; $k_{lmn}$ for momenta (in original cube), and $\delta_{ijk}$ where the matrices ($F^{-1}$) $F$ take the (inverse) FFT:
$$ V_{abc} = \frac{1}{3^3}\sum_{ijk} e^{\I (\delta_i x_a + \delta_j x_b + \delta_k x_c)} F^{-1}_{al}F^{-1}_{bm}F^{-1}_{cn}\tilde{C}_{ijk;lmn}\tilde{n}_{ijk;lmn}. $$where
$$ \tilde{n}_{ijk;lmn} = F_{la}F_{mb}F_{nc} e^{-\I(\delta_i x_a + \delta_j x_b + \delta_k x_c)} n_{abc} $$from scipy.special import erf
import itertools
class Gaussian(object):
    """Spherically symmetric Gaussian charge distribution.

    The width `a` is chosen so that the density drops to `n_R` at
    radius `R`.
    """
    def __init__(self, R=1.0, n_R=1e-12):
        self.R = R
        self.a = np.sqrt(-R**2/(2.0*np.log(n_R)))
    def n(self, r):
        """Density at radius r (unit peak at the origin)."""
        scaled = (r/self.a)**2
        return np.exp(-scaled/2.0)
    def V(self, r):
        """Exact Coulomb potential of the distribution, regularized at
        r = 0 by clamping to machine epsilon."""
        r = np.maximum(r, np.finfo(float).eps)
        amplitude = np.sqrt(np.pi/2)
        return amplitude*np.divide(self.a**3*erf(r/self.a/np.sqrt(2)), r)
class TopHat(object):
    """Uniform spherical (top-hat) charge distribution of radius R."""
    def __init__(self, R=1.0):
        self.R = R
    def n(self, r):
        """Unit density inside radius R, zero outside."""
        return np.where(r < self.R, 1.0, 0.0)
    def V(self, r):
        """Coulomb potential: parabolic inside, R^3/(3r) outside.

        Both branches are evaluated by np.where, matching the original
        behavior (including any divide warning at r = 0)."""
        inside = (3*self.R**2 - r**2)/6
        outside = np.divide(self.R**3, 3*r)
        return np.where(r < self.R, inside, outside)
def bcast(n, N):
    """Return an index that broadcasts a 1D array along the n'th of N
    dimensions.

    Returns a tuple rather than a list: modern numpy requires tuple
    indices (indexing with a list of slices/None was deprecated and then
    removed).  Tuples also work on all older versions.
    """
    inds = [None]*N
    inds[n] = slice(None)
    return tuple(inds)
def coulomb3(rho, L):
"""Compute the Coulomb potential by padding into the (3N)^3 space."""
L = np.asarray(L, dtype=double) #length of the cell
N = np.asarray(rho.shape) #number of lattice
D = np.sqrt((L**2).sum()) #diameter
K = [2*np.pi * fft.fftfreq(3*_N, d=_L/_N)[bcast(_n, len(N))] #momtentum spectrum
for _n, (_N, _L) in enumerate(zip(N, L))]
Rho = np.zeros(3*N, dtype=rho.dtype)
inds = [slice(0,_N) for _N in N]
Rho[inds] = rho
k = np.sqrt(sum(_K**2 for _K in K))
i0 = np.where(k == 0)
C_k = (1 - np.cos(D*k))/k**2
C_k[i0] = 0.5*D**2 #L'hospital rule for C when k=0
V = fft.ifftn(C_k * fft.fftn(Rho))[inds]
return V
def coulomb(rho, L):
    """Compute the Coulomb potential without padding.

    Truncated-kernel method: the linear convolution is assembled from
    3**dim momentum offsets (delta).  The offsets are stored as extra
    *leading* array dimensions so all FFTs happen in a single
    vectorized call, letting the FFT library use multiple cores at the
    cost of 3**dim times the memory.

    Arguments
    ---------
    rho : array
        Charge distribution on a dim-dimensional grid.
    L : sequence of float
        Box lengths in each dimension.

    Returns
    -------
    V : complex array
        Potential on the same grid as `rho`.

    Fixes over the previous version: the final normalization is
    3**dim (number of offsets) rather than dim**3 — these agree only
    for dim == 3; reshape-based broadcasting replaces list indexing
    (removed in modern NumPy); `np.fft`/`float` replace the
    pylab-injected `fft`/`double`; masked divide avoids the k=0
    RuntimeWarning.
    """
    dim = len(L)
    L = np.asarray(L, dtype=float)
    N = np.asarray(rho.shape)
    D = np.sqrt((L**2).sum())   # kernel truncation diameter

    def _along(axis, ndim):
        # Shape that broadcasts a 1D array along `axis` of `ndim` axes.
        shape = [1] * ndim
        shape[axis] = -1
        return shape

    # Coordinates/momenta broadcast along the *last* dim axes; the
    # delta offsets broadcast along the *first* dim axes.
    K = [(2 * np.pi * np.fft.fftfreq(_N, d=_L / _N)).reshape(_along(dim + _n, 2 * dim))
         for _n, (_N, _L) in enumerate(zip(N, L))]
    X = [(_L / _N * np.arange(_N)).reshape(_along(dim + _n, 2 * dim))
         for _n, (_N, _L) in enumerate(zip(N, L))]
    delta = [(2 * np.pi * np.arange(3) / 3.0 / _L).reshape(_along(_n, 2 * dim))
             for _n, _L in enumerate(L)]

    exp_delta = np.exp(1j * sum(_d * _x for _d, _x in zip(delta, X)))
    rho_delta = exp_delta.conj() * rho[(None,) * dim + (slice(None),) * dim]
    k = np.sqrt(sum((_k + _d)**2 for _k, _d in zip(K, delta)))
    # Truncated kernel; the k=0 value D**2/2 is the L'Hopital limit.
    C = np.ma.divide(1 - np.cos(D * k), k**2).filled(D**2 / 2.0)
    _axes = tuple(range(dim, 2 * dim))
    V = (exp_delta
         * np.fft.ifftn(C * np.fft.fftn(rho_delta, axes=_axes), axes=_axes)
         ).sum(axis=tuple(range(dim))) / 3**dim   # average over 3**dim offsets
    return V
def coulomb2(rho, L):
    """Compute the Coulomb potential without padding.

    Same truncated-kernel method as `coulomb`, but looping explicitly
    over the 3**dim momentum offsets.  This saves memory relative to
    the vectorized version but will not use multiple cores.

    Arguments
    ---------
    rho : array
        Charge distribution on a dim-dimensional grid.
    L : sequence of float
        Box lengths in each dimension.

    Returns
    -------
    V : complex array
        Potential on the same grid as `rho`.

    Fixes over the previous version: the final normalization is
    3**dim (number of offsets) rather than dim**3 — these agree only
    for dim == 3; reshape-based broadcasting replaces list indexing
    (removed in modern NumPy); `np.fft`/`float` replace the
    pylab-injected `fft`/`double`; masked divide avoids the k=0
    RuntimeWarning.
    """
    dim = len(L)
    L = np.asarray(L, dtype=float)
    N = np.asarray(rho.shape)
    D = np.sqrt((L**2).sum())   # kernel truncation diameter

    def _along(axis):
        # Shape that broadcasts a 1D array along `axis` of dim axes.
        shape = [1] * dim
        shape[axis] = -1
        return shape

    K = [(2 * np.pi * np.fft.fftfreq(_N, d=_L / _N)).reshape(_along(_n))
         for _n, (_N, _L) in enumerate(zip(N, L))]
    X = [(_L / _N * np.arange(_N)).reshape(_along(_n))
         for _n, (_N, _L) in enumerate(zip(N, L))]

    V = 0
    for l in itertools.product(range(3), repeat=dim):
        delta = [2 * np.pi * _l / 3.0 / _L for _l, _L in zip(l, L)]
        exp_delta = np.exp(1j * sum(_d * _x for _x, _d in zip(X, delta)))
        rho_delta = exp_delta.conj() * rho
        k = np.sqrt(sum((_k + _d)**2 for _k, _d in zip(K, delta)))
        # Truncated kernel; the k=0 value D**2/2 is the L'Hopital limit.
        C = np.ma.divide(1 - np.cos(D * k), k**2).filled(D**2 / 2.0)
        V += exp_delta * np.fft.ifftn(C * np.fft.fftn(rho_delta))
    return V / 3**dim   # average over the 3**dim offsets
# Cross-check the three implementations on a random charge density:
# all must agree on a small 4x4x4 grid in a unit box.
rho = rand(4, 4, 4)
L = [1, 1, 1]
X1, X2, X3 = (_c(rho, L) for _c in (coulomb, coulomb2, coulomb3))
assert np.allclose(X1, X2)
assert np.allclose(X1, X3)
-c:77: RuntimeWarning: invalid value encountered in divide -c:106: RuntimeWarning: invalid value encountered in divide -c:49: RuntimeWarning: invalid value encountered in divide
# Report this kernel's connection file so another client (e.g. a
# qtconsole) can attach to the same session.
import IPython
print IPython.kernel.get_connection_file()
/Users/mforbes/.ipython/profile_default/security/kernel-d82a34e3-bf0a-44bb-9ce7-333e8688aa94.json
# Compare the numerical Coulomb potential against the exact analytic
# solution for a spherically symmetric test charge on a 3D grid.
V = TopHat(R=1.0)
#V = Gaussian(R=1.0)
N = 16*4
L = 2.0*V.R          # box exactly spans the diameter of the charge
dx = L/N
V_0 = V.V(0)         # central potential, used to normalize the plots
x = (arange(N) - N//2)*dx
X = x[:, None, None]
Y = x[None, :, None]
Z = x[None, None, :]
R = np.sqrt(X**2 + Y**2 + Z**2)
rho = V.n(R)
r = np.array(sorted(R.ravel()))
Vn = coulomb(rho, (L,)*3)
# The unpadded (coulomb) and padded (coulomb3) methods should agree.
assert np.allclose(*(_c(rho, (L,)*3) for _c in (coulomb, coulomb3)))
_i = np.argsort(R.ravel())
plot(r, Vn.ravel()[_i]/V_0, 'b.')    # numerical potential
plot(r, rho.ravel()[_i], 'y')        # density profile
plot(r, V.V(r)/V_0, 'g')             # exact potential
#xlim(0, sqrt(3)*V.R)
plt.xlabel('r')
-c:29: RuntimeWarning: divide by zero encountered in divide
<matplotlib.text.Text at 0x117ac9150>
# Benchmark the three implementations (vectorized, looped, padded).
%timeit coulomb(rho, (L,)*3)
%timeit coulomb2(rho, (L,)*3)
%timeit coulomb3(rho, (L,)*3)
1 loops, best of 3: 1.91 s per loop 1 loops, best of 3: 1.74 s per loop 1 loops, best of 3: 1.66 s per loop
# Same comparison on a larger box (L = 6R), applying the truncated
# kernel directly rather than via the coulomb() helpers.
V = TopHat(R=1.0)
#V = Gaussian(R=1.0)
N = 16
L = 6.0*V.R
dx = L/N
D = sqrt(3.0)*L/3    # kernel truncation radius
# Sanity check printed below: D should exceed 2R + sqrt(3)*dx so the
# truncated kernel covers the support of the convolution.
print 2*V.R + sqrt(3)*dx, D
V_0 = V.V(0)         # central potential, for normalization
x = (arange(N) - N//2)*dx
kx = 2*pi * fft.fftfreq(N, d=dx)
# Crude regularization: nudge the k=0 mode off zero to avoid 0/0.
kx = where(kx == 0, 0.0001, kx)
X = x[:, None, None]
Y = x[None, :, None]
Z = x[None, None, :]
R = np.sqrt(X**2 + Y**2 + Z**2)
n = V.n(R)
kX = kx[:, None, None]
kY = kx[None, :, None]
kZ = kx[None, None, :]
k2 = (kX**2 + kY**2 + kZ**2)
V_k = (1-cos(D*sqrt(k2)))/k2   # truncated Coulomb kernel
#V_k = 1.0/k2
r = np.array(sorted(R.ravel()))
Vn = fft.ifftn(V_k * fft.fftn(n))
_i = np.argsort(R.ravel())
plot(r, Vn.ravel()[_i]/V_0, 'b.')    # numerical potential
plot(r, n.ravel()[_i], 'y')          # density profile
plot(r, V.V(r)/V_0, 'g')             # exact potential
xlim(0, 2*V.R)
2.64951905284 3.46410161514
# Truncated-kernel convolution with the 3x padding already built into
# N and L (N = 16*3 points over L = 6*3), so D = L/3 covers one cell.
N = 16*3
L = 6.0*3
D = L/3.0            # kernel truncation radius = original cell size
dx = L/N
x = (arange(N) - N//2)*dx
kx = 2*pi * fft.fftfreq(N, d=dx)
# Push the k=0 mode to a huge value so 1/k^2 is effectively zero there.
kx = where(kx == 0, 1/finfo(float).eps, kx)
X = x[:, None, None]
Y = x[None, :, None]
Z = x[None, None, :]
kX = kx[:, None, None]
kY = kx[None, :, None]
kZ = kx[None, None, :]
k2 = (kX**2 + kY**2 + kZ**2)
V_k = 4*pi * (1-cos(D*sqrt(k2)))/k2   # truncated kernel, with the 4*pi factor
R = sqrt(X**2 + Y**2 + Z**2)
n = where(R<1, 1.0, 0.0)              # unit top-hat density
# NOTE(review): this rebinds V (previously a TopHat instance above) to
# the potential array — later cells that expect the instance will break.
V = fft.ifftn(V_k * fft.fftn(n))
plot(R.ravel(), V.ravel(), '.')
[<matplotlib.lines.Line2D at 0x10ed96690>]
# 1D check: periodic convolution with the bare 1/k^2 kernel on a
# top-hat line charge.
N = 64
L = 10.0
dz = L/N
z = (arange(N) - N//2)*dz
n = where(abs(z)<1, 1, 0)
plot(z, n);xlabel('$z$');ylabel('$n(z)$')
k = 2*pi * fft.fftfreq(N, d=dz)
k2 = k**2;
# Zero the k=0 (constant) mode rather than dividing by zero.
V_k = where(k2 == 0, 0, 1/k2)
V = fft.ifft(fft.fft(n) * V_k)
plot(z, V)
-c:9: RuntimeWarning: divide by zero encountered in divide
[<matplotlib.lines.Line2D at 0x1175e0bd0>]
# 1D check with 3x padding (N and L tripled) and a truncated kernel.
N = 64*3
L = 10.0*3
D = L                # truncation radius set to the full (padded) box
dz = L/N
z = (arange(N) - N//2)*dz
n = where(abs(z)<1, 1, 0)
plot(z, n);xlabel('$z$');ylabel('$n(z)$')
k = 2*pi * fft.fftfreq(N, d=dz)
k2 = k**2;
# NOTE(review): every other kernel in this notebook uses
# (1 - cos(D*k))/k^2; here the sign is (1 + cos(D*k)).  With D = L the
# periodic grid gives cos(D*k) = 1, which would make the 1-cos form
# vanish identically — confirm the sign flip here is intentional.
V_k = where(k2 == 0, 0, (1 + cos(D*k))/k2)
V = fft.ifft(fft.fft(n) * V_k)
plot(z, V)
-c:10: RuntimeWarning: divide by zero encountered in divide
[<matplotlib.lines.Line2D at 0x1183f0e50>]
# Benchmark: masked divide vs where() for regularizing 1/k^2, and
# confirm the two give identical results.
k = np.arange(64)*1.0
K = k[:,None,None]**2 + k[None, :,None]**2 + k[None, None, :]**2
%timeit np.ma.divide(1.0, K).filled(0)
%timeit np.where(K==0, 0, np.divide(1.0,K))
np.allclose(np.ma.divide(1.0, K).filled(0),
np.where(K==0, 0, np.divide(1.0,K)))
100 loops, best of 3: 3.72 ms per loop 100 loops, best of 3: 5.96 ms per loop
-c:257: RuntimeWarning: divide by zero encountered in divide -c:4: RuntimeWarning: divide by zero encountered in divide
True
# Benchmark: fastest way to obtain a zeroed array of the same shape.
A = rand(64,64,64)
%timeit np.zeros(A.shape, dtype=A.dtype)
%timeit np.zeros_like(A)
%timeit 0*A
10000 loops, best of 3: 65.6 µs per loop 10000 loops, best of 3: 101 µs per loop 10000 loops, best of 3: 157 µs per loop