#!/usr/bin/env python
# coding: utf-8
# Notebook export: speech analysis/resynthesis with the World vocoder,
# mel-cepstrum analysis, and MLSA-filter synthesis via pysas.
# NOTE: the get_ipython() magics only work when run under IPython/Jupyter.

# In[1]:
get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('autoreload', '2')

import matplotlib.pyplot as plt
from IPython.display import Audio, display
import numpy as np
from pysas import World, waveread

# # Speech analysis and synthesis with World

# In[2]:
# Test utterance from http://festvox.org/cmu_arctic/dbs_bdl.html (16 kHz).
signal, samplingrate, _ = waveread("test/cmu_arctic/arctic_a0001.wav")

# In[3]:
world = World(samplingrate)

# In[4]:
# f0 contour, spectral-envelope matrix, and aperiodicity matrix (one row per frame).
f0, spec_mat, aperiod_mat = world.analyze(signal)

# In[5]:
plt.plot(f0)

# In[6]:
# Inspect a single analysis frame.
spec = spec_mat[300]

# In[7]:
plt.plot(np.log(spec))

# In[8]:
# Resynthesize speech from the World parameters.
out = world.synthesis(f0, spec_mat, aperiod_mat)

# In[9]:
plt.plot(signal)
plt.plot(out)

# In[10]:
# Scale float [-1, 1] audio to int16 for playback; use the file's actual rate.
display(Audio(data=np.int16(out * 32767.0), rate=samplingrate))

# # Mel-Cepstrum Analysis

# In[11]:
from pysas.mcep import spec2mcep, mcep2spec, mcep2coef, coef2mcep, estimate_alpha

# In[12]:
# All-pass constant approximating the mel scale at this sampling rate.
alpha = round(estimate_alpha(samplingrate), 3)
alpha

# In[13]:
# 24th-order mel-cepstrum of the World spectral envelope.
mcep = spec2mcep(spec, 24, alpha)

# In[14]:
spec2 = mcep2spec(mcep, alpha, world.fftsize())

# In[15]:
plt.plot(np.log(spec))
plt.plot(np.log(spec2[:world.envelopesize()]))

# ## from fft

# In[16]:
# Power spectrum of one Hanning-windowed frame (frame shift 80 samples, frame 300).
frameshift = 80
i = frameshift * 300
windowsize = 1024
sig = signal[i:i + windowsize] * np.hanning(windowsize)
power_spectrum = (np.absolute(np.fft.fft(sig)) ** 2)[:(windowsize >> 1) + 1]
plt.plot(np.log(power_spectrum))

# In[17]:
fft_mcep = spec2mcep(power_spectrum, 20, alpha)
reconst_pspec = mcep2spec(fft_mcep, alpha, windowsize)

# In[18]:
plt.plot(np.log(spec))
plt.plot(np.log(spec2))
plt.plot(np.log(power_spectrum))
plt.plot(np.log(reconst_pspec))

# ## Convert Mel-Cepstrum to Coefficients for MLSA Digital Filter

# In[19]:
coef = mcep2coef(fft_mcep, alpha)
reconst_mcep = coef2mcep(coef, alpha)

# In[20]:
plt.plot(fft_mcep)
plt.plot(reconst_mcep)
plt.plot(coef)

# ## Generate excited pulse from F0 sequence

# In[21]:
from pysas.excite import ExcitePulse

# In[22]:
# Pulse generator at the input's sampling rate, frame shift 80, no gaussian noise.
ep = ExcitePulse(samplingrate, 80, False)

# In[23]:
plt.plot(ep.gen(f0))

# In[24]:
display(Audio(data=np.int16(ep.gen(f0) * 700), rate=samplingrate))

# # Speech Synthesis With Mel-Cepstrum Analysis and MLSA Filter

# In[25]:
from pysas.synthesis.mlsa import MLSAFilter
from pysas.synthesis import Synthesis
from pysas.mcep import spec2mcep_from_matrix

# In[26]:
cepstrum_dim = 128
mcep_mat = spec2mcep_from_matrix(spec_mat, cepstrum_dim, alpha)

# In[27]:
# Convert every frame's mel-cepstrum to MLSA filter coefficients.
coef_mat = np.array([mcep2coef(frame, alpha) for frame in mcep_mat])

# In[28]:
# MLSA filter (Pade order 5) driven at an 80-sample frame shift.
mlsa = MLSAFilter(cepstrum_dim, alpha, 5)
syn = Synthesis(80, mlsa)
pulse = ep.gen(f0)

# In[29]:
synth = syn.synthesis(pulse, coef_mat)

# In[30]:
plt.plot(synth)
plt.plot(signal)

# ### Original

# In[31]:
display(Audio(data=np.int16(signal * 32767.0), rate=samplingrate))

# ### Synthesized

# In[32]:
display(Audio(data=np.int16(synth * 32767.0), rate=samplingrate))

# ### voice changer

# In[33]:
# Warp the spectrum (alpha * 1.3) and lower the pitch (f0 * 0.7) for a voice-change effect.
mlsa = MLSAFilter(cepstrum_dim, alpha * 1.3, 5)
syn = Synthesis(80, mlsa)
pulse = ep.gen(f0 * 0.7)
a = syn.synthesis(pulse, coef_mat)
display(Audio(data=np.int16(a * 32767.0), rate=samplingrate))

# In[ ]: