The purpose of this code is to demonstrate the seamless image cloning algorithm on the GPU. See [1] for details. To solve the sparse least-squares problem resulting from the algorithm in [1], we use a geometric Jacobi method inspired by [2].
[1] http://www.cs.jhu.edu/~misha/Fall07/Papers/Perez03.pdf
[2] http://http.developer.nvidia.com/GPUGems/gpugems_ch38.html
import PIL
import PIL.Image
import scipy
import scipy.misc
# Load the destination, source and mask images as numpy arrays.
destination = numpy.array(PIL.Image.open("Destination.png"))
source = numpy.array(PIL.Image.open("Source.png"))
mask = numpy.array(PIL.Image.open("Mask.png"))

# Binarize the mask in place: every non-zero entry becomes exactly 1.
mask[mask != 0] = 1

# Extents of both images along the first (y) and second (x) array axes.
dny, dnx = destination.shape[0], destination.shape[1]
sny, snx = source.shape[0], source.shape[1]

# Naive clone: paste the source pixels over the destination wherever the mask
# is set, with no blending at all (shown later for comparison).
naive_clone = destination.copy()
naive_clone[mask == 1] = source[mask == 1]
# Show the three inputs and the naive clone side by side in a 1x4 grid.
figsize(19, 4)
for _subplot_id, _image, _title in (
        (141, destination, "destination"),
        (142, source, "source"),
        (143, mask, "mask"),
        (144, naive_clone, "naive_clone")):
    matplotlib.pyplot.subplot(_subplot_id)
    matplotlib.pyplot.imshow(_image)
    matplotlib.pyplot.title(_title)
import skimage
import skimage.morphology

# A masked pixel is "strictly interior" when it and all of its 4-connected
# neighbours are masked. The footprint is therefore a cross, not a full 3x3
# square: the update equation only ever looks at 4-connected neighbours, and
# the GPU kernel compute_strict_interior_and_border classifies pixels with the
# same 4-connected test. A 3x3 square would additionally demote pixels that
# merely touch the outside diagonally, so the CPU and GPU solvers would work
# on different regions.
# NOTE(review): how erosion treats pixels on the image edge depends on the
# scikit-image version/mode -- confirm if the mask touches the image edge.
four_connected_footprint = numpy.array(
    [[0, 1, 0],
     [1, 1, 1],
     [0, 1, 0]], dtype=numpy.uint8)

strict_interior = skimage.morphology.erosion(mask, four_connected_footprint)

# (row indices, column indices) of the unknown pixels, and how many there are.
strict_interior_indices = strict_interior.nonzero()
num_strict_interior_pixels = strict_interior_indices[0].shape[0]

# Border pixels are masked but not strictly interior. Erosion can only clear
# mask entries, so strict_interior <= mask and the subtraction cannot go
# negative.
border = mask - strict_interior
# Show the strict interior and the border masks next to each other, without
# interpolation so that individual pixels stay crisp.
figsize(9, 4)
for _subplot_id, _image, _title in (
        (121, strict_interior, "strict_interior"),
        (122, border, "border")):
    matplotlib.pyplot.subplot(_subplot_id)
    matplotlib.pyplot.imshow(_image, interpolation="nearest")
    matplotlib.pyplot.title(_title)
We compute a seamless clone by solving for the unknown pixel values inside a masked region. The seamless cloning algorithm operates only on greyscale images, so we apply it to each color channel independently. For each pixel location $p$ in the strict interior of the masked region $\Omega$, we relate the intensities of the unknown, source, and destination pixel values as follows:
$$|N_p|f_p - \sum_{q\in N_p \cap \Omega} f_q = \sum_{q\in N_p \cap \delta \Omega} f_q^* + \sum_{q\in N_p} (g_p - g_q)$$where:
$N_p$ is the set of $p$'s 4-connected neighbors (note that $|N_p|$ might be less than 4 if $\Omega$ extends to the image border);
$f_p$ is the unknown intensity value at $p$;
$f_q^*$ is the known intensity value from the destination image in the border of the masked region $\delta \Omega$; and
$g_p$ is the known intensity value from the source image at $p$, and $g_q$ is the known intensity value from the source image at $q \in N_p$
Rearranging the equation above, we get:
$$f_p = \frac{ \sum_{q\in N_p \cap \Omega} f_q + \sum_{q\in N_p \cap \delta \Omega} f_q^* + \sum_{q\in N_p} (g_p - g_q) }{ |N_p| }$$This equation forms the basis of our iterative algorithm. At time $t=0$, we initialize $f^0=0$. Then at each subsequent time $t+1$, we compute $f^{t+1}$ using the values of $f$ from the previous timestep as follows:
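$$f_p^{t+1} = \frac{ \sum_{q\in N_p \cap \Omega} f_q^t + \sum_{q\in N_p \cap \delta \Omega} f_q^* + \sum_{q\in N_p} (g_p - g_q) }{ |N_p| }$$
The linear system above is weakly diagonally dominant in every row, and strictly diagonally dominant in the rows of pixels that have at least one neighbor in $\delta \Omega$. As long as the masked region does not cover the whole image, every connected group of unknown pixels contains such a pixel, so the system is irreducibly diagonally dominant and our iterative algorithm is guaranteed to converge for any value of $f^0$.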
import scipy
import scipy.sparse
import scipy.sparse.linalg
def _four_connected_neighbors(y, x, num_rows, num_cols):
    """Return the in-image 4-connected neighbours of pixel (y, x).

    Neighbours are listed in the order right, up, left, down; the ones that
    fall outside a num_rows x num_cols image are dropped.

    Returns a (ys, xs) tuple of integer index arrays that can be used
    directly for numpy fancy indexing.
    """
    candidates = ((y, x + 1), (y - 1, x), (y, x - 1), (y + 1, x))
    kept = [(cy, cx) for cy, cx in candidates
            if 0 <= cy < num_rows and 0 <= cx < num_cols]
    # An explicit integer dtype keeps the arrays usable as indices even when
    # the list is empty.
    ys = numpy.array([cy for cy, _ in kept], dtype=numpy.intp)
    xs = numpy.array([cx for _, cx in kept], dtype=numpy.intp)
    return ys, xs


# CPU reference solver: Jacobi iterations on channel 0 only.
#
# f_star (known destination values) and g (source values) are converted to
# float64 up front. Keeping them in the image's integer dtype makes
# "num_neighbors * g[y, x]" a small-integer scalar product, which wraps around
# under NumPy 2 promotion rules (e.g. 4 * uint8(200)).
f_star = destination[:, :, 0].astype(numpy.float64)
g = source[:, :, 0].astype(numpy.float64)

# Two buffers for the unknowns: one is read while the other is written.
# Both start at zero, with the known destination values copied onto the
# border pixels.
f_current = numpy.zeros_like(mask, dtype=numpy.float32)
f_next = numpy.zeros_like(mask, dtype=numpy.float32)

f_current[border == 1] = f_star[border == 1]
f_next[border == 1] = f_star[border == 1]

# Everything in the update equation except the sum over the current interior
# values is independent of the iteration, so it is computed once per pixel
# here instead of once per pixel per iteration.
pixel_stencils = []
for y, x in zip(strict_interior_indices[0], strict_interior_indices[1]):
    y_neighbors, x_neighbors = _four_connected_neighbors(y, x, dny, dnx)
    num_neighbors = y_neighbors.shape[0]

    # Split the neighbours into unknowns (strict interior) and everything
    # else, which is treated as border and read from f_star.
    neighbor_is_interior = strict_interior[(y_neighbors, x_neighbors)] == 1
    neighbor_is_border = strict_interior[(y_neighbors, x_neighbors)] == 0

    interior_neighbors = (y_neighbors[neighbor_is_interior],
                          x_neighbors[neighbor_is_interior])
    sum_f_star_border = numpy.sum(
        f_star[(y_neighbors[neighbor_is_border],
                x_neighbors[neighbor_is_border])])

    # sum over q in N_p of (g_p - g_q)
    sum_g = (num_neighbors * g[y, x]) - numpy.sum(g[(y_neighbors, x_neighbors)])

    pixel_stencils.append(
        (y, x, interior_neighbors, sum_f_star_border, sum_g, num_neighbors))

num_iterations = 500

for n in range(num_iterations):

    for y, x, interior_neighbors, sum_f_star_border, sum_g, num_neighbors in pixel_stencils:
        sum_f_current_strict_interior_neighbors = numpy.sum(f_current[interior_neighbors])

        f_next_value = (sum_f_current_strict_interior_neighbors + sum_f_star_border + sum_g) / num_neighbors

        # Keep the estimate inside the displayable intensity range.
        f_next[y, x] = numpy.clip(f_next_value, 0.0, 255.0)

    # The buffer just written becomes the input of the next sweep.
    f_current, f_next = f_next, f_current
# Composite the CPU result: start from channel 0 of the destination and
# overwrite only the strictly interior pixels with the solved values. The
# float32 solution is cast to the destination's dtype by the assignment.
seamless_clone_cpu = destination[:, :, 0].copy()
_interior_rows, _interior_cols = strict_interior_indices
seamless_clone_cpu[_interior_rows, _interior_cols] = f_current[_interior_rows, _interior_cols]

figsize(4, 4)
matplotlib.pyplot.imshow(seamless_clone_cpu, cmap="gray", interpolation="nearest")
matplotlib.pyplot.title("seamless_clone_cpu")
import pycuda.autoinit
import pycuda.driver
import pycuda.compiler
# Compile the three CUDA kernels used below:
#   compute_strict_interior_and_border - classifies every mask pixel,
#   compute_seamless_clone_iteration   - one Jacobi sweep over the unknowns,
#   compute_seamless_clone_composite   - merges the solution into the image.
# All kernels use one thread per pixel. The per-thread scratch arrays are
# sized by BLOCK_SIZE_X / BLOCK_SIZE_Y, so kernels must be launched with a
# block no larger than (BLOCK_SIZE_X, BLOCK_SIZE_Y, 1).
source_module = pycuda.compiler.SourceModule \
(
"""
#define BLOCK_SIZE_Y      8
#define BLOCK_SIZE_X      32
#define MAX_NUM_NEIGHBORS 4

// Writes the linear indices of the 4-connected neighbors of index_2d that lie
// inside the nx-by-ny image into neighbors[] (in the order right, up, left,
// down) and returns how many were written (at most MAX_NUM_NEIGHBORS).
__device__ int gather_neighbors( int2 index_2d, int ny, int nx, int* neighbors )
{
    int num_neighbors = 0;

    if ( index_2d.x + 1 < nx )
    {
        neighbors[ num_neighbors ] = ( nx * index_2d.y ) + ( index_2d.x + 1 );
        num_neighbors++;
    }

    if ( index_2d.y - 1 >= 0 )
    {
        neighbors[ num_neighbors ] = ( nx * ( index_2d.y - 1 ) ) + index_2d.x;
        num_neighbors++;
    }

    if ( index_2d.x - 1 >= 0 )
    {
        neighbors[ num_neighbors ] = ( nx * index_2d.y ) + ( index_2d.x - 1 );
        num_neighbors++;
    }

    if ( index_2d.y + 1 < ny )
    {
        neighbors[ num_neighbors ] = ( nx * ( index_2d.y + 1 ) ) + index_2d.x;
        num_neighbors++;
    }

    return num_neighbors;
}

// Classifies every pixel of d_mask (values 0 or 1):
//   d_strict_interior = 1 where the pixel and all of its in-image 4-connected
//                         neighbors are masked,
//   d_border          = 1 where the pixel is masked but not strictly interior.
// Both outputs are written as 0 everywhere else.
__global__ void compute_strict_interior_and_border(
    unsigned char* d_mask,
    unsigned char* d_strict_interior,
    unsigned char* d_border,
    int            num_pixels_y,
    int            num_pixels_x )
{
    // Per-thread scratch list of neighbor indices, kept in shared memory.
    __shared__ int s_neighbors[ BLOCK_SIZE_Y ][ BLOCK_SIZE_X ][ MAX_NUM_NEIGHBORS ];

    int  ny       = num_pixels_y;
    int  nx       = num_pixels_x;
    int2 index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
    int  index_1d = ( nx * index_2d.y ) + index_2d.x;

    // Threads of partially filled blocks that fall outside the image do nothing.
    if ( index_2d.x < nx && index_2d.y < ny )
    {
        int* neighbors     = s_neighbors[ threadIdx.y ][ threadIdx.x ];
        int  num_neighbors = gather_neighbors( index_2d, ny, nx, neighbors );

        unsigned char strict_interior = 0;
        unsigned char border          = 0;

        if ( d_mask[ index_1d ] == 1 )
        {
            bool all_neighbor_masks_set = true;

            for ( int i = 0; i < num_neighbors; i++ )
            {
                if ( d_mask[ neighbors[ i ] ] == 0 )
                {
                    all_neighbor_masks_set = false;
                }
            }

            if ( all_neighbor_masks_set )
            {
                strict_interior = 1;
            }
            else
            {
                border = 1;
            }
        }

        d_strict_interior[ index_1d ] = strict_interior;
        d_border[ index_1d ]          = border;
    }
}

// Performs one Jacobi sweep. For every strictly interior pixel p:
//
//   d_f_next[p] = ( sum of d_f_current over strictly interior neighbors
//                 + sum of d_f_star    over border neighbors
//                 + sum over all neighbors q of ( d_g[p] - d_g[q] ) ) / |N_p|
//
// clamped to [0, 255]. Pixels that are not strictly interior are left untouched.
__global__ void compute_seamless_clone_iteration(
    unsigned char* d_f_star,
    unsigned char* d_g,
    unsigned char* d_strict_interior,
    unsigned char* d_border,
    float*         d_f_current,
    float*         d_f_next,
    int            num_pixels_y,
    int            num_pixels_x )
{
    // Per-thread scratch list of neighbor indices, kept in shared memory.
    __shared__ int s_neighbors[ BLOCK_SIZE_Y ][ BLOCK_SIZE_X ][ MAX_NUM_NEIGHBORS ];

    int  ny       = num_pixels_y;
    int  nx       = num_pixels_x;
    int2 index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
    int  index_1d = ( nx * index_2d.y ) + index_2d.x;

    // The bounds test comes first so that d_strict_interior is never read
    // out of range.
    if ( index_2d.x < nx && index_2d.y < ny && d_strict_interior[ index_1d ] == 1 )
    {
        int* neighbors     = s_neighbors[ threadIdx.y ][ threadIdx.x ];
        int  num_neighbors = gather_neighbors( index_2d, ny, nx, neighbors );

        float sum_f_current_strict_interior_neighbors = 0.0f;
        float sum_f_star_border                       = 0.0f;

        // The g_p term must be weighted by the actual neighbor count |N_p|,
        // which is smaller than 4 for pixels on the image edge. A fixed
        // factor of 4 over-weights g_p there and disagrees with the
        // CPU reference.
        float sum_g = (float)num_neighbors * d_g[ index_1d ];

        for ( int i = 0; i < num_neighbors; i++ )
        {
            int neighbor = neighbors[ i ];

            if ( d_strict_interior[ neighbor ] == 1 )
            {
                sum_f_current_strict_interior_neighbors += d_f_current[ neighbor ];
            }

            if ( d_border[ neighbor ] == 1 )
            {
                sum_f_star_border += d_f_star[ neighbor ];
            }

            sum_g -= d_g[ neighbor ];
        }

        float f_next_value = ( sum_f_current_strict_interior_neighbors + sum_f_star_border + sum_g ) / (float)num_neighbors;

        d_f_next[ index_1d ] = min( 255.0f, max( 0.0f, f_next_value ) );
    }
}

// Builds the final image: the solved value d_f (converted to unsigned char)
// on strictly interior pixels, the destination value d_f_star everywhere else.
__global__ void compute_seamless_clone_composite(
    unsigned char* d_f_star,
    float*         d_f,
    unsigned char* d_strict_interior,
    unsigned char* d_seamless_clone,
    int            num_pixels_y,
    int            num_pixels_x )
{
    int  ny       = num_pixels_y;
    int  nx       = num_pixels_x;
    int2 index_2d = make_int2( ( blockIdx.x * blockDim.x ) + threadIdx.x, ( blockIdx.y * blockDim.y ) + threadIdx.y );
    int  index_1d = ( nx * index_2d.y ) + index_2d.x;

    if ( index_2d.x < nx && index_2d.y < ny )
    {
        if ( d_strict_interior[ index_1d ] == 1 )
        {
            d_seamless_clone[ index_1d ] = (unsigned char)d_f[ index_1d ];
        }
        else
        {
            d_seamless_clone[ index_1d ] = d_f_star[ index_1d ];
        }
    }
}
"""
)
def _blocks_to_cover(num_pixels, threads_per_block):
    """Return how many blocks of threads_per_block threads cover num_pixels (rounded up)."""
    return (int(num_pixels) + threads_per_block - 1) // threads_per_block


# Image extents, as 32-bit integers so they can be passed to the kernels'
# int parameters.
num_pixels_y = numpy.int32(mask.shape[0])
num_pixels_x = numpy.int32(mask.shape[1])

# For each kernel: look up its handle, fix the thread block at 32x8 threads
# and size the grid so that the blocks cover every pixel of the image.

# Pixel classification kernel.
compute_strict_interior_and_border_function = source_module.get_function("compute_strict_interior_and_border")
compute_strict_interior_and_border_function_block = (32, 8, 1)
num_blocks_y = _blocks_to_cover(num_pixels_y, compute_strict_interior_and_border_function_block[1])
num_blocks_x = _blocks_to_cover(num_pixels_x, compute_strict_interior_and_border_function_block[0])
compute_strict_interior_and_border_function_grid = (num_blocks_x, num_blocks_y)

# Jacobi sweep kernel.
compute_seamless_clone_iteration_function = source_module.get_function("compute_seamless_clone_iteration")
compute_seamless_clone_iteration_function_block = (32, 8, 1)
num_blocks_y = _blocks_to_cover(num_pixels_y, compute_seamless_clone_iteration_function_block[1])
num_blocks_x = _blocks_to_cover(num_pixels_x, compute_seamless_clone_iteration_function_block[0])
compute_seamless_clone_iteration_function_grid = (num_blocks_x, num_blocks_y)

# Compositing kernel.
compute_seamless_clone_composite_function = source_module.get_function("compute_seamless_clone_composite")
compute_seamless_clone_composite_function_block = (32, 8, 1)
num_blocks_y = _blocks_to_cover(num_pixels_y, compute_seamless_clone_composite_function_block[1])
num_blocks_x = _blocks_to_cover(num_pixels_x, compute_seamless_clone_composite_function_block[0])
compute_seamless_clone_composite_function_grid = (num_blocks_x, num_blocks_y)
# Host-side arrays. They determine the size of each device buffer and provide
# the zero-filled initial contents; all share the mask's shape, and all but
# f_gpu (float32) share its dtype.
f_star_gpu = numpy.zeros_like(mask)
g_gpu = numpy.zeros_like(mask)
mask_gpu = mask.copy()
strict_interior_gpu = numpy.zeros_like(mask)
border_gpu = numpy.zeros_like(mask)
f_gpu = numpy.zeros_like(mask, dtype=numpy.float32)
seamless_clone_gpu = numpy.zeros_like(mask)

# One device buffer per host array, plus a second float buffer so the solver
# can read one while writing the other.
_device_alloc = pycuda.driver.mem_alloc
f_star_device = _device_alloc(f_star_gpu.nbytes)
g_device = _device_alloc(g_gpu.nbytes)
mask_device = _device_alloc(mask_gpu.nbytes)
strict_interior_device = _device_alloc(strict_interior_gpu.nbytes)
border_device = _device_alloc(border_gpu.nbytes)
f_current_device = _device_alloc(f_gpu.nbytes)
f_next_device = _device_alloc(f_gpu.nbytes)
seamless_clone_device = _device_alloc(seamless_clone_gpu.nbytes)

# Upload the mask and zero-initialise the output and solver buffers.
for _device_buffer, _host_array in (
        (mask_device, mask_gpu),
        (strict_interior_device, strict_interior_gpu),
        (border_device, border_gpu),
        (f_current_device, f_gpu),
        (f_next_device, f_gpu),
        (seamless_clone_device, seamless_clone_gpu)):
    pycuda.driver.memcpy_htod(_device_buffer, _host_array)

# Classify every pixel on the GPU as strict interior, border or neither.
compute_strict_interior_and_border_function(
    mask_device,
    strict_interior_device,
    border_device,
    num_pixels_y,
    num_pixels_x,
    block=compute_strict_interior_and_border_function_block,
    grid=compute_strict_interior_and_border_function_grid)
# Single-channel GPU run on channel 0, used to validate against the CPU result.
f_star_gpu = destination[:, :, 0].copy()
g_gpu = source[:, :, 0].copy()
f_gpu = numpy.zeros_like(mask, dtype=numpy.float32)

# Upload the channel data and reset both solver buffers to zero.
for _device_buffer, _host_array in (
        (f_star_device, f_star_gpu),
        (g_device, g_gpu),
        (f_current_device, f_gpu),
        (f_next_device, f_gpu)):
    pycuda.driver.memcpy_htod(_device_buffer, _host_array)

num_iterations = 500

for n in range(num_iterations):

    # One Jacobi sweep: reads f_current_device, writes f_next_device.
    compute_seamless_clone_iteration_function(
        f_star_device, g_device,
        strict_interior_device, border_device,
        f_current_device, f_next_device,
        num_pixels_y, num_pixels_x,
        block=compute_seamless_clone_iteration_function_block,
        grid=compute_seamless_clone_iteration_function_grid)

    # The buffer just written becomes the input of the next sweep.
    f_current_device, f_next_device = f_next_device, f_current_device

# Merge the solution with the destination channel and copy it back to the host.
compute_seamless_clone_composite_function(
    f_star_device, f_current_device,
    strict_interior_device, seamless_clone_device,
    num_pixels_y, num_pixels_x,
    block=compute_seamless_clone_composite_function_block,
    grid=compute_seamless_clone_composite_function_grid)

pycuda.driver.memcpy_dtoh(seamless_clone_gpu, seamless_clone_device)

figsize(15, 15)
matplotlib.pyplot.imshow(seamless_clone_gpu, cmap="gray", interpolation="nearest")
matplotlib.pyplot.title("seamless_clone_gpu")
To verify the correctness of our GPU implementation, we run the CPU and GPU algorithms for 500 iterations. This is too few iterations to achieve a visually plausible result, but it is sufficient to verify correctness.
# Per-pixel absolute difference between the CPU and GPU results, computed in
# float32 so that the subtraction cannot wrap around.
diff = numpy.abs(seamless_clone_cpu.astype(numpy.float32) - seamless_clone_gpu.astype(numpy.float32))

# Worst case for normalisation: every pixel differs by the full range of 255.
max_diff = numpy.ones_like(diff, dtype=numpy.float32) * 255

# The parenthesised single-argument form prints the same text whether print
# is a statement (Python 2) or a function (Python 3).
print(
    "Difference between CPU and GPU naive results as a percentage of the maximum possible difference (should be less than 1%%): %0.2f%%" %
    (100 * (numpy.linalg.norm(diff) / numpy.linalg.norm(max_diff))))
print("")

# CPU result, GPU result and their difference side by side.
figsize(14, 4)

matplotlib.pyplot.subplot(131)
matplotlib.pyplot.imshow(seamless_clone_cpu, cmap="gray", interpolation="nearest")
matplotlib.pyplot.title("seamless_clone_cpu")

matplotlib.pyplot.subplot(132)
matplotlib.pyplot.imshow(seamless_clone_gpu, cmap="gray", interpolation="nearest")
matplotlib.pyplot.title("seamless_clone_gpu")

matplotlib.pyplot.subplot(133)
matplotlib.pyplot.imshow(diff, cmap="gray", interpolation="nearest")
matplotlib.pyplot.title("diff")
matplotlib.pyplot.colorbar();
Difference between CPU and GPU naive results as a percentage of the maximum possible difference (should be less than 1%): 0.16%
# Full-colour GPU run: the solver works on one channel at a time, so the same
# upload / iterate / composite / download sequence is repeated for the r, g
# and b channels and the three results are stacked at the end.
seamless_clone_gpu_r = numpy.zeros_like(mask)
seamless_clone_gpu_g = numpy.zeros_like(mask)
seamless_clone_gpu_b = numpy.zeros_like(mask)

# The pixel classification depends only on the mask, so it is computed once
# and shared by all three channels.
compute_strict_interior_and_border_function(
    mask_device,
    strict_interior_device,
    border_device,
    num_pixels_y,
    num_pixels_x,
    block=compute_strict_interior_and_border_function_block,
    grid=compute_strict_interior_and_border_function_grid)

num_iterations = 5000

# Channel index 0, 1, 2 <-> r, g, b; each result is downloaded in place into
# the matching host array.
for channel, seamless_clone_gpu_channel in enumerate(
        (seamless_clone_gpu_r, seamless_clone_gpu_g, seamless_clone_gpu_b)):

    f_star_gpu = destination[:, :, channel].copy()
    g_gpu = source[:, :, channel].copy()
    f_gpu = numpy.zeros_like(mask, dtype=numpy.float32)

    # Upload this channel and restart the solver from zero.
    pycuda.driver.memcpy_htod(f_star_device, f_star_gpu)
    pycuda.driver.memcpy_htod(g_device, g_gpu)
    pycuda.driver.memcpy_htod(f_current_device, f_gpu)
    pycuda.driver.memcpy_htod(f_next_device, f_gpu)

    for n in range(num_iterations):

        # One Jacobi sweep: reads f_current_device, writes f_next_device.
        compute_seamless_clone_iteration_function(
            f_star_device,
            g_device,
            strict_interior_device,
            border_device,
            f_current_device,
            f_next_device,
            num_pixels_y,
            num_pixels_x,
            block=compute_seamless_clone_iteration_function_block,
            grid=compute_seamless_clone_iteration_function_grid)

        # The buffer just written becomes the input of the next sweep.
        f_current_device, f_next_device = f_next_device, f_current_device

    compute_seamless_clone_composite_function(
        f_star_device,
        f_current_device,
        strict_interior_device,
        seamless_clone_device,
        num_pixels_y,
        num_pixels_x,
        block=compute_seamless_clone_composite_function_block,
        grid=compute_seamless_clone_composite_function_grid)

    pycuda.driver.memcpy_dtoh(seamless_clone_gpu_channel, seamless_clone_device)

# Stack the three channel results along a new last axis.
seamless_clone_gpu = numpy.concatenate((
    seamless_clone_gpu_r[:, :, numpy.newaxis],
    seamless_clone_gpu_g[:, :, numpy.newaxis],
    seamless_clone_gpu_b[:, :, numpy.newaxis]),
    axis=2).copy()
# Final comparison: destination, naive paste and seamless clone in a 1x3 grid.
figsize(20, 20)
for _subplot_id, _image, _title in (
        (131, destination, "destination"),
        (132, naive_clone, "naive_clone"),
        (133, seamless_clone_gpu, "seamless_clone_gpu")):
    matplotlib.pyplot.subplot(_subplot_id)
    matplotlib.pyplot.imshow(_image)
    matplotlib.pyplot.title(_title)