21小时上手深度学习3-Neural Song

Neural Style Audio

源码python文件解析

import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from IPython.display import Audio, display

# Paths of the two input clips: the content track and the style track.
CONTENT_FILENAME = "inputs/imperials.mp3"
STYLE_FILENAME = "inputs/usa.mp3"

# Hand each clip to IPython's display machinery, content first.
for _clip_path in (CONTENT_FILENAME, STYLE_FILENAME):
    display(Audio(_clip_path))

# Read an audio file and produce its spectrogram.
# Fourier phases are ignored.
n_fft = 2048


def read_audio_spectum(filename):
    """Load an audio file and return its log-magnitude spectrogram.

    The waveform is decoded with ``librosa.load`` and analysed with a
    short-time Fourier transform (``librosa.stft``) whose window size is the
    module-level ``n_fft``. Only the magnitude is kept; the phase is
    discarded.

    Args:
        filename: Path of the audio file to read.

    Returns:
        A ``(S, fs)`` tuple. ``S`` is ``log1p(|STFT|)`` cropped to at most
        the first 430 columns (time frames); ``fs`` is the sampling rate
        returned by ``librosa.load``.
    """
    x, fs = librosa.load(filename)
    # n_fft is passed by keyword so the call works whether or not the
    # installed librosa accepts it positionally.
    S = librosa.stft(x, n_fft=n_fft)
    # log1p compresses the dynamic range and maps silence (0) to 0.
    S = np.log1p(np.abs(S[:, :430]))
    return S, fs

# Read the content and style spectrograms, then crop the style spectrogram so
# that it is no larger than the content spectrogram along either axis. The
# content spectrogram is used as the reference size here; a different common
# size could be chosen instead.
# Note: ``fs`` is assigned twice, so it ends up holding the sampling rate
# returned for the style file.
a_content, fs = read_audio_spectum(CONTENT_FILENAME)
a_style, fs = read_audio_spectum(STYLE_FILENAME)

# Axis 0 of the spectrogram is treated as channels, axis 1 as samples.
N_CHANNELS, N_SAMPLES = a_content.shape

# Crop a_style so its shape does not exceed that of a_content.
a_style = a_style[:N_CHANNELS, :N_SAMPLES]

# Visualize the spectrograms of the content and style tracks.
# One matplotlib figure holds two side-by-side subplots; each subplot gets a
# title and shows the first 400 rows of the corresponding spectrogram.
plt.figure(figsize=(10, 5))
for _position, (_title, _spectrogram) in enumerate(
        (('Content', a_content), ('Style', a_style)), start=1):
    plt.subplot(1, 2, _position)
    plt.title(_title)
    plt.imshow(_spectrogram[:400, :])
plt.show()

# Compute content and style features.
# The convolution defined from this kernel is only a feature extractor: its
# weights are random and are not trained.
N_FILTERS = 4096

# Convert both spectrograms into the inputs fed to TensorFlow: transpose,
# prepend two singleton axes (giving a 4-D array), and make the memory
# layout contiguous.
a_content_tf = np.ascontiguousarray(a_content.T[np.newaxis, np.newaxis, :, :])
a_style_tf = np.ascontiguousarray(a_style.T[np.newaxis, np.newaxis, :, :])

# Filter shape is [filter_height, filter_width, in_channels, out_channels].
# The kernel is Gaussian noise scaled by ``std``.
# NOTE(review): the scale looks like a Glorot-style 2 / (fan_in + fan_out)
# variance with an additional sqrt(2) gain -- confirm the intended
# initialization scheme.
std = np.sqrt(2) * np.sqrt(2.0 / ((N_CHANNELS + N_FILTERS) * 11))
kernel = std * np.random.randn(1, 11, N_CHANNELS, N_FILTERS)

# Build a TensorFlow graph pinned to the CPU and run the random convolution
# once over each spectrogram to obtain the target features.
g = tf.Graph()
with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
    # Data shape is [batch, in_height, in_width, in_channels]:
    # batch 1, height 1, width N_SAMPLES, N_CHANNELS channels.
    # A placeholder is enough here because x is only fed with fixed data.
    x = tf.placeholder('float32', [1, 1, N_SAMPLES, N_CHANNELS], name="x")

    kernel_tf = tf.constant(kernel, name="kernel", dtype='float32')
    # conv2d takes the input tensor first and the filter second.
    conv = tf.nn.conv2d(
        x,
        kernel_tf,
        strides=[1, 1, 1, 1],
        padding="VALID",
        name="conv")

    net = tf.nn.relu(conv)

    content_features = net.eval(feed_dict={x: a_content_tf})
    style_features = net.eval(feed_dict={x: a_style_tf})

    # Style target: Gram matrix of the style features (filter-by-filter
    # inner products over all positions), divided by N_SAMPLES.
    features = np.reshape(style_features, (-1, N_FILTERS))
    style_gram = np.matmul(features.T, features) / N_SAMPLES

# Optimization: find an input x whose features match the content features
# and whose Gram matrix matches the style Gram matrix.
from sys import stderr

ALPHA = 1e-2
# NOTE: learning_rate and iterations are not read by the L-BFGS-B call
# below, which is configured with its own maxiter.
learning_rate = 1e-3
iterations = 100

result = None
with tf.Graph().as_default():
    # Build the graph with a variable input. x is a Variable rather than a
    # placeholder because x itself is what gets optimized: it starts as
    # low-amplitude Gaussian noise and is updated step by step towards the
    # final result. (An all-zeros initial value is a possible alternative.)
    x = tf.Variable(
        np.random.randn(1, 1, N_SAMPLES, N_CHANNELS).astype(np.float32) * 1e-3,
        name="x")

    kernel_tf = tf.constant(kernel, name="kernel", dtype='float32')
    # tf.nn.conv2d arguments:
    #   1. input: 4-D tensor shaped [batch, in_height, in_width, in_channels].
    #   2. filter: 4-D tensor shaped
    #      [filter_height, filter_width, in_channels, out_channels], same
    #      dtype as the input; its third dimension equals the input's fourth.
    #   3. strides: length-4 vector, the step along each input dimension.
    #   4. padding: either "SAME" or "VALID"; selects the convolution mode.
    conv = tf.nn.conv2d(
        x,
        kernel_tf,
        strides=[1, 1, 1, 1],
        padding="VALID",
        name="conv")

    net = tf.nn.relu(conv)

    # Content loss: squared distance between the current features and the
    # content features, weighted by ALPHA (l2_loss returns half the sum of
    # squares, hence the factor 2).
    content_loss = ALPHA * 2 * tf.nn.l2_loss(net - content_features)

    # Style loss: squared distance between the current Gram matrix and the
    # style Gram matrix. The last static dimension of net is the number of
    # filters.
    number = net.get_shape().as_list()[-1]
    feats = tf.reshape(net, (-1, number))
    gram = tf.matmul(tf.transpose(feats), feats) / N_SAMPLES
    style_loss = 2 * tf.nn.l2_loss(gram - style_gram)

    # Overall loss.
    loss = content_loss + style_loss

    # tf.contrib supplies the SciPy optimizer wrapper used here.
    opt = tf.contrib.opt.ScipyOptimizerInterface(
        loss, method='L-BFGS-B', options={'maxiter': 300})

    # Optimization.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        print('Started optimization.')
        opt.minimize(sess)

        print('Final loss:', loss.eval())
        result = x.eval()

# Invert the spectrogram and save the result.
a = np.zeros_like(a_content)
# expm1 undoes the log1p compression applied when the spectrograms were read.
a[:N_CHANNELS, :] = np.expm1(result[0, 0].T)

# Phase reconstruction: start from uniformly random phases in [-pi, pi) and,
# for 500 rounds, resynthesize a waveform from the fixed magnitude ``a`` with
# the current phase, then take the phase of that waveform's STFT as the next
# estimate.
p = 2 * np.pi * np.random.random_sample(a.shape) - np.pi
for i in range(500):
    S = a * np.exp(1j * p)
    x = librosa.istft(S)
    p = np.angle(librosa.stft(x, n_fft=n_fft))

OUTPUT_FILENAME = 'outputs/out.wav'
# NOTE(review): librosa.output was removed in librosa 0.8, so this call needs
# an older librosa release (or a different WAV writer) -- confirm the pinned
# version.
librosa.output.write_wav(OUTPUT_FILENAME, x, fs)
print(OUTPUT_FILENAME)
display(Audio(OUTPUT_FILENAME))

# Show the content, style and result spectrograms next to each other,
# drawing the first 400 rows of each.
plt.figure(figsize=(15, 5))
_panels = (('Content', a_content), ('Style', a_style), ('Result', a))
for _column, (_heading, _spec) in enumerate(_panels, start=1):
    plt.subplot(1, 3, _column)
    plt.title(_heading)
    plt.imshow(_spec[:400, :])
plt.show()