# Kanji PCA experiment, part 1 (notebook export).
#
# Reads the kanji list, renders every character to a 72x72 PNG with the
# IPA Mincho font, stacks the rendered glyphs into the matrix ``X`` (one
# flattened image per row) and fits a 100-component PCA whose components end
# up in ``V``.  Bare expressions such as ``df.head(10)`` are notebook cell
# outputs and are kept so the file can still be pasted back into cells.
import os
from random import randint

import matplotlib.font_manager as fm  # was missing: `fm` raised NameError
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from IPython.display import Image
from scipy import ndimage
from sklearn.decomposition import PCA

# Plain-Python spelling of the `%matplotlib inline` cell magic.
get_ipython().run_line_magic('matplotlib', 'inline')

FONT_PATH = 'ipam.ttc'
IMAGE_DIR = 'img/'
# Side length, in pixels, of every rasterized glyph.  The figures are 1x1
# inch, so saving them at this dpi yields exactly IMAGE_SIDE x IMAGE_SIDE
# pixels, which is what every ``reshape((72, 72))`` in this file assumes.
IMAGE_SIDE = 72

df = pd.read_csv("kanji_list.csv", sep='\t', header=None)
df.head(10)

# Column 1 holds the characters; rows without one are dropped.
kanji = df[1].dropna().values
type(kanji)
kanji[:10]

# Quick on-screen preview of the first character.
prop = fm.FontProperties(fname=FONT_PATH, size=50)
plt.figure(figsize=(1, 1))
plt.text(0, 0, kanji[0], ha='center', va='center', fontproperties=prop)
plt.xlim(-0.1, 0.1)
plt.ylim(-0.1, 0.1)


def rasterize_kanji(kanji, save_to):
    """Render one character to an image file.

    Parameters
    ----------
    kanji : str
        The character to draw.  Only ``kanji[0]`` is rendered, so a longer
        string contributes its first character.
    save_to : str
        Destination path handed to ``plt.savefig``.

    The glyph is drawn centred in a 1x1 inch, axis-free figure and saved at
    ``IMAGE_SIDE`` dpi so the output size does not depend on the active
    backend's default dpi.
    """
    fig = plt.figure(figsize=(1, 1))
    try:
        prop = fm.FontProperties(fname=FONT_PATH, size=70)
        plt.text(0, 0, kanji[0], ha='center', va='center',
                 fontproperties=prop)
        plt.xlim(-0.1, 0.1)
        plt.ylim(-0.1, 0.1)
        plt.axis("off")
        plt.savefig(save_to, dpi=IMAGE_SIDE)
    finally:
        # Thousands of glyphs are rendered in a loop; always release the
        # figure, even when saving fails.
        plt.close(fig)


rasterize_kanji(kanji[0], "1.png")
Image(filename='1.png')

# savefig does not create missing directories.
os.makedirs(IMAGE_DIR, exist_ok=True)
for i, k in enumerate(kanji):
    rasterize_kanji(k, "img/{0:04}.png".format(i))


def _load_grayscale(path):
    """Read an image file and return it as a 2-D float32 grayscale array.

    Replaces ``scipy.ndimage.imread(path, flatten=True)``, which was removed
    from SciPy.  Colour images are collapsed with the ITU-R 601 luma weights
    (any alpha channel is ignored) and the 0..1 floats that matplotlib
    returns for PNG files are rescaled to the 0..255 range.
    """
    pixels = np.asarray(plt.imread(path), dtype=np.float32)
    if pixels.ndim == 3:
        luma = np.array([0.299, 0.587, 0.114], dtype=np.float32)
        pixels = np.dot(pixels[..., :3], luma)
    return pixels * np.float32(255.0)


# Sorted so that row i of X is the glyph written as img/{i:04}.png;
# os.listdir returns entries in arbitrary order.
image_names = sorted(
    name for name in os.listdir(IMAGE_DIR) if name.endswith('.png')
)
X = np.array([
    _load_grayscale(os.path.join(IMAGE_DIR, fname)).ravel()
    for fname in image_names
])
X.shape
72*72
plt.imshow(X[0, :].reshape((72, 72)), cmap='gray')

pca = PCA(n_components=100)
pca.fit(X)
pca_score = pca.explained_variance_ratio_
V = pca.components_  # one principal component per row

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
np.cumsum(pca_score)[-1]

plt.imshow(V[0, :].reshape((72, 72)), cmap='gray')
plt.colorbar()
plt.title('first principal component of kanji dataset')


def plot_random_kanji():
    """Fill a 10x10 subplot grid with 100 glyphs drawn at random from X.

    Rows are drawn with replacement, so the same glyph can appear twice.
    Draws into the current figure.
    """
    picks = np.random.choice(np.arange(X.shape[0]), 100)
    for cell, row in enumerate(picks):
        plt.subplot(10, 10, cell + 1)
        plt.imshow(X[row, :].reshape((72, 72)), cmap='gray')
        plt.axis('off')


plt.figure(figsize=(10, 10))
plot_random_kanji()


def plot_principal_components():
    """Show the first 100 rows of V as images in a 10x10 subplot grid.

    Draws into the current figure.
    """
    for cell in range(100):
        plt.subplot(10, 10, cell + 1)
        plt.imshow(V[cell, :].reshape((72, 72)), cmap='gray')
        plt.axis('off')
# Kanji PCA experiment, part 2 (notebook export).
#
# Uses the fitted ``pca`` / ``V`` and the glyph matrix ``X`` from the first
# part to (a) rebuild glyphs from their strongest principal components and
# (b) synthesize new glyph-like images from random component mixtures.
import skimage
from skimage.filters import threshold_otsu
# NOTE(review): IPython.html.widgets is the legacy location of the widget
# API; confirm it still resolves in the target IPython version.
from IPython.html.widgets import interact

skimage.__version__

plt.figure(figsize=(10, 10))
plot_principal_components()


def _show_glyph(image):
    """Draw a flattened or 2-D 72x72 image in gray on the current axes."""
    plt.imshow(image.reshape((72, 72)), cmap='gray')
    plt.axis('off')


def _binarize(image):
    """Return the boolean mask of pixels above the image's Otsu threshold."""
    return image > threshold_otsu(image)


def _approximate(kanji, n_components):
    """Rebuild a glyph from its ``n_components`` strongest components.

    The glyph is centred with ``pca.mean_`` before projecting onto the rows
    of ``V`` (the components were fitted on centred data), the components
    are ranked by absolute weight, and the mean is added back to the
    weighted sum of the selected components.  Ties keep component order.
    """
    weights = np.dot(V, kanji - pca.mean_)
    ranking = np.argsort(-np.abs(weights), kind='mergesort')[:n_components]
    return pca.mean_ + np.dot(weights[ranking], V[ranking])


def make_gallery(func):
    """Call ``func`` once in each cell of a 10x10 subplot grid."""
    for i in range(100):
        plt.subplot(10, 10, i + 1)
        func()


def decompose_character(kanji):
    """Plot the glyph rebuilt from its 1, 10, 50 and 100 strongest
    components in a 2x2 grid on the current figure.

    ``kanji`` is one flattened glyph (a row of ``X``).
    """
    for i, components in enumerate([1, 10, 50, 100]):
        plt.subplot(2, 2, i + 1)
        _show_glyph(_approximate(kanji, components))


decompose_character(X[0, :])


def decompose_character_threshold(kanji):
    """Plot Otsu-binarized rebuilds next to the original glyph.

    Cells 1-5 of a 2x3 grid show the glyph rebuilt from its 1, 10, 25, 50
    and 100 strongest components and then thresholded; cell 6 shows the
    unmodified ``kanji``.
    """
    for i, components in enumerate([1, 10, 25, 50, 100]):
        plt.subplot(2, 3, i + 1)
        _show_glyph(_binarize(_approximate(kanji, components)))
    plt.subplot(2, 3, 6)
    _show_glyph(kanji)


decompose_character_threshold(X[0, :])

interact(lambda index: decompose_character_threshold(X[index, :]),
         index=(0, X.shape[0] - 1))


def approximate_reconstruction(kanji, n_components):
    """Draw the Otsu-binarized rebuild of ``kanji`` from its
    ``n_components`` strongest components on the current axes."""
    _show_glyph(_binarize(_approximate(kanji, n_components)))


def _reconstruct_random_glyph(n_components):
    """Draw the binarized rebuild of one randomly chosen row of X."""
    row = np.random.choice(np.arange(X.shape[0]))
    approximate_reconstruction(X[row, :], n_components)


plt.figure(figsize=(10, 10))
make_gallery(lambda: _reconstruct_random_glyph(100))

plt.figure(figsize=(10, 10))
make_gallery(lambda: _reconstruct_random_glyph(20))

# Per-component mean and spread of the PCA scores over the whole data set;
# these parameterize the random weights drawn below.
scores = pca.transform(X[:, :])
means = scores.mean(axis=0)
stds = np.std(scores, axis=0)


def _sample_random_kanji():
    """Return a flattened image mixing randomly weighted components.

    Between 5 and 49 components are drawn (``randint`` excludes its upper
    bound), with replacement, and each is weighted by a normal sample using
    that component's score mean and standard deviation.
    NOTE(review): unlike the reconstructions above, the mean glyph
    (``pca.mean_``) is not added here; this keeps the sampling formula of
    the original experiment - confirm whether that is intended.
    """
    sample = np.zeros(V.shape[1])
    for _ in range(np.random.randint(5, 50)):
        component = np.random.choice(np.arange(V.shape[0]))
        weight = np.random.normal(means[component], stds[component])
        sample += weight * V[component, :]
    return sample


new_kanji = _sample_random_kanji()
_show_glyph(new_kanji)

thresh = threshold_otsu(new_kanji)
binary = new_kanji > thresh
_show_glyph(binary)

# Blurring before thresholding merges the speckle into stroke-like blobs.
new_kanji_smooth = ndimage.gaussian_filter(new_kanji.reshape((72, 72)), 1.5)
_show_glyph(_binarize(new_kanji_smooth))


def examine_smoothing(factor):
    """Show the module-level ``new_kanji`` blurred with a Gaussian of sigma
    ``factor`` and then Otsu-binarized."""
    new_kanji_smooth = ndimage.gaussian_filter(
        new_kanji.reshape((72, 72)), factor)
    _show_glyph(_binarize(new_kanji_smooth))


interact(examine_smoothing, factor=(0.5, 5.5, 0.1))

# Blur, grey-dilate, blur again, then threshold.
new_kanji_smooth = ndimage.gaussian_filter(new_kanji.reshape((72, 72)), 1.8)
new_kanji_smooth = ndimage.grey_dilation(new_kanji_smooth, size=2)
new_kanji_smooth = ndimage.gaussian_filter(new_kanji_smooth, 1.8)
_show_glyph(_binarize(new_kanji_smooth))


def generate_new_kanji(factor):
    """Draw one freshly sampled random glyph on the current axes.

    The sample is blurred with a Gaussian of sigma ``factor`` and then
    Otsu-binarized.
    """
    new_kanji = _sample_random_kanji()
    new_kanji_smooth = ndimage.gaussian_filter(
        new_kanji.reshape((72, 72)), factor)
    _show_glyph(_binarize(new_kanji_smooth))


plt.figure(figsize=(10, 10))
make_gallery(lambda: generate_new_kanji(1.8))

plt.figure(figsize=(10, 10))
make_gallery(lambda: generate_new_kanji(2.3))


def generate_new_kanji_binary_op(factor):
    """Draw one freshly sampled random glyph with an erosion step.

    The sample is blurred (sigma ``factor``), grey-eroded with a size-2
    window, blurred again with the same sigma and then Otsu-binarized.
    """
    new_kanji = _sample_random_kanji()
    new_kanji_smooth = ndimage.gaussian_filter(
        new_kanji.reshape((72, 72)), factor)
    new_kanji_smooth = ndimage.grey_erosion(new_kanji_smooth, size=2)
    new_kanji_smooth = ndimage.gaussian_filter(new_kanji_smooth, factor)
    _show_glyph(_binarize(new_kanji_smooth))


plt.figure(figsize=(10, 10))
make_gallery(lambda: generate_new_kanji_binary_op(2.1))

plt.figure(figsize=(10, 10))
make_gallery(lambda: generate_new_kanji_binary_op(2.8))

plt.figure(figsize=(10, 10))
make_gallery(lambda: generate_new_kanji_binary_op(1.5))