Image Artistic Style Transfer with the VGG16 Neural Network

Basic Principle

Extract image features with VGG16 (or another neural network) and use the Gram matrix to transfer the style of one image onto another.

VGG16

Not much introduction is needed: runner-up of the 2014 ImageNet classification competition and winner of the localization competition. VGG networks stack small (3x3) convolution kernels and pooling layers to build deep networks, reaching depths of 16 or 19 layers; VGG16 and VGG19 are the best-known variants. The two architectures are very similar, both built from alternating stacks of convolutional and pooling layers followed by fully connected layers for classification. They differ only in depth and parameter count: VGG19 adds three convolutional layers over VGG16 and therefore has more parameters.

Both are available directly in Keras, which downloads the pretrained weights automatically:

from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
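For example, the pretrained VGG16 can be used as a feature extractor directly. A minimal sketch (the random array below is only a stand-in for a real preprocessed image):

from keras.applications.vgg16 import VGG16, preprocess_input
import numpy as np

# Load only the convolutional backbone; include_top=False drops the classifier head
model = VGG16(weights="imagenet", include_top=False)
# Stand-in for a real image batch: one 224x224 RGB array with values in [0, 255]
image = preprocess_input(np.random.rand(1, 224, 224, 3) * 255)
features = model.predict(image)
print(features.shape)  # (1, 7, 7, 512) for a 224x224 input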

Here, combined with a transformer (image transformation) network, we build the model in PyTorch:

import torch
from collections import namedtuple
from torchvision import models
import torch.nn as nn
import torch.nn.functional as F


# VGG16 network definition
class VGG16(torch.nn.Module):
    """Vgg16 Net"""
    def __init__(self, requires_grad=False):
        super(VGG16, self).__init__()
        vgg_pretrained_features = models.vgg16(pretrained=True).features

        # Split the pretrained feature extractor into four slices,
        # ending at relu1_2, relu2_2, relu3_3 and relu4_3 respectively
        self.slice1 = torch.nn.Sequential()
        self.slice2 = torch.nn.Sequential()
        self.slice3 = torch.nn.Sequential()
        self.slice4 = torch.nn.Sequential()

        for x in range(4):
            self.slice1.add_module(str(x), vgg_pretrained_features[x])

        for x in range(4, 9):
            self.slice2.add_module(str(x), vgg_pretrained_features[x])

        for x in range(9, 16):
            self.slice3.add_module(str(x), vgg_pretrained_features[x])

        for x in range(16, 23):
            self.slice4.add_module(str(x), vgg_pretrained_features[x])

        # Freeze the weights: VGG16 serves only as a fixed feature extractor
        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, X):
        h = self.slice1(X)
        h_relu1_2 = h
        h = self.slice2(h)
        h_relu2_2 = h
        h = self.slice3(h)
        h_relu3_3 = h
        h = self.slice4(h)
        h_relu4_3 = h

        vgg_outputs = namedtuple("VggOutputs", ["relu1_2", "relu2_2", "relu3_3", "relu4_3"])
        output = vgg_outputs(h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3)

        return output


# Image transformation network: downsample, residual blocks, upsample
class TransformerNet(torch.nn.Module):
    def __init__(self):
        super(TransformerNet, self).__init__()
        self.model = nn.Sequential(
            ConvBlock(3, 32, kernel_size=9, stride=1),
            ConvBlock(32, 64, kernel_size=3, stride=2),
            ConvBlock(64, 128, kernel_size=3, stride=2),
            ResidualBlock(128),
            ResidualBlock(128),
            ResidualBlock(128),
            ResidualBlock(128),
            ResidualBlock(128),
            ConvBlock(128, 64, kernel_size=3, upsample=True),
            ConvBlock(64, 32, kernel_size=3, upsample=True),
            ConvBlock(32, 3, kernel_size=9, stride=1, normalize=False, relu=False),
        )

    def forward(self, x):
        return self.model(x)


class ResidualBlock(torch.nn.Module):
    def __init__(self, channels):
        super(ResidualBlock, self).__init__()
        self.block = nn.Sequential(
            ConvBlock(channels, channels, kernel_size=3, stride=1, normalize=True, relu=True),
            ConvBlock(channels, channels, kernel_size=3, stride=1, normalize=True, relu=False),
        )

    def forward(self, x):
        return self.block(x) + x


class ConvBlock(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, upsample=False, normalize=True, relu=True):
        super(ConvBlock, self).__init__()
        self.upsample = upsample
        self.block = nn.Sequential(
            nn.ReflectionPad2d(kernel_size // 2),
            nn.Conv2d(in_channels, out_channels, kernel_size, stride)
        )
        self.norm = nn.InstanceNorm2d(out_channels, affine=True) if normalize else None
        self.relu = relu

    def forward(self, x):
        if self.upsample:
            x = F.interpolate(x, scale_factor=2)
        x = self.block(x)
        if self.norm is not None:
            x = self.norm(x)
        if self.relu:
            x = F.relu(x)
        return x


# Model smoke test
if __name__ == '__main__':
    input1 = torch.rand([1, 3, 224, 224])  # a single random 224x224 RGB image
    model_x = VGG16()
    print(model_x)
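As a quick sanity check (a sketch assuming the definitions above), TransformerNet preserves the input resolution: the two stride-2 downsampling convolutions are undone by the two 2x upsampling blocks:

x = torch.rand([1, 3, 256, 256])   # one random 256x256 RGB image
net = TransformerNet()
print(net(x).shape)                # torch.Size([1, 3, 256, 256])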

Gram Matrix

A Gram matrix is the matrix formed by all pairwise inner products of k vectors in n-dimensional Euclidean space; it is a symmetric matrix.

A more intuitive view:

The input image's feature map has shape [ch, h, w]. After a flatten (spreading h and w into a single dimension) and a matrix transpose, we obtain matrices of shape [ch, h*w] and [h*w, ch]. Taking the inner product of these two matrices yields the Gram matrix.

Style transfer with the Gram matrix:

1. Prepare the target (content) image and the target style image;

2. Use a deep network to extract feature vectors from the generated image (initialized as white noise) and from the style target. Compute the Gram matrices of both images' features, take minimizing the difference between the matrices as the optimization objective, and keep adjusting the generated image so that its style grows ever more similar.

The Gram matrix in torch:

def gram_matrix(y):
    (b, c, h, w) = y.size()
    features = y.view(b, c, w * h)                 # flatten each channel to a vector
    features_t = features.transpose(1, 2)
    gram = features.bmm(features_t) / (c * h * w)  # [b, c, c], normalized
    return gram
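To make the two-step recipe concrete, here is a minimal optimization-based sketch of it (not the fast feed-forward training used later in this post). It assumes style_image is a normalized [1, 3, H, W] tensor, and uses the VGG16 wrapper, gram_matrix, and the F alias for torch.nn.functional defined above:

# Optimize the image itself so its Gram matrices match the style image's
vgg = VGG16().eval()
target = torch.randn_like(style_image).requires_grad_(True)  # white-noise init
optimizer = torch.optim.Adam([target], lr=0.01)

style_grams = [gram_matrix(f) for f in vgg(style_image)]  # fixed targets
for step in range(500):
    optimizer.zero_grad()
    loss = sum(F.mse_loss(gram_matrix(f), g)
               for f, g in zip(vgg(target), style_grams))
    loss.backward()
    optimizer.step()

Adding a content loss term on top (as the training loop below does with relu2_2 features) keeps the result close to a content image instead of producing pure texture.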

Start Training

Prepare the training files and a style image, for example 20 random images and Van Gogh's The Starry Night.
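Note that datasets.ImageFolder, used by the training script below, expects the images to live in at least one subdirectory of the dataset root (e.g. ./dataset/images/001.jpg); this also matches the glob pattern {args.dataset}/*/* used later when sampling preview images.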

The utils.py helpers

Configure the training parameters:

import argparse

parser = argparse.ArgumentParser(description="Parser for training")
parser.add_argument("--style", type=str, default="images/styles/the_starry_night.jpg", help="Path to style image")
parser.add_argument("--dataset", type=str, help="Path to training dataset")
parser.add_argument("--epochs", type=int, default=1, help="Number of training epochs")
parser.add_argument("--batch_size", type=int, default=4, help="Batch size for training")
parser.add_argument("--image_size", type=int, default=256, help="Size of training images")
parser.add_argument("--style_size", type=int, help="Size of style image")
parser.add_argument("--lr", type=float, default=1e-3, help="Learning rate")
parser.add_argument("--lambda_img", type=float, default=1e5, help="Weight for image (content) loss")
parser.add_argument("--lambda_style", type=float, default=1e10, help="Weight for style loss")
parser.add_argument("--model_path", type=str, help="Optional path to checkpoint model")
parser.add_argument("--model_checkpoint", type=int, default=1000, help="Batches between model checkpoints")
parser.add_argument("--result_checkpoint", type=int, default=1000, help="Batches between saved image results")

The transform used for style training:

from torchvision import transforms

# ImageNet statistics used by the pretrained VGG16 (assumed standard values)
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]


def train_transform(image_size):
    transform = transforms.Compose(
        [
            transforms.Resize(int(image_size * 1.15)),  # resize slightly larger, then crop
            transforms.RandomCrop(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean, std),
        ]
    )
    return transform

The transform used for style conversion (inference):

def style_transform(image_size=None):
    resize = [transforms.Resize(image_size)] if image_size else []
    transform = transforms.Compose(resize + [transforms.ToTensor(), transforms.Normalize(mean, std)])
    return transform

Denormalize an image tensor using the mean and standard deviation:

def denormalize(tensors):
    # Invert transforms.Normalize channel by channel (in place)
    for c in range(3):
        tensors[:, c].mul_(std[c]).add_(mean[c])
    return tensors
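A quick round trip shows how the two helpers pair up (a sketch; image.jpg is a placeholder path):

from PIL import Image

img = Image.open("image.jpg")            # placeholder path
t = style_transform()(img)               # normalized tensor, shape [3, H, W]
restored = denormalize(t.unsqueeze(0))   # back to the [0, 1] pixel range, in place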

The train.py training script

Training configuration

train_args = TrainArgs()
args = train_args.initialize().parse_args()

args.dataset = './dataset'
args.style = './images/styles/the_starry_night.jpg'
args.epochs = 2400  # epochs * (dataset size / batch_size) should be a multiple of 1000 (the checkpoint interval)
args.batch_size = 4
args.image_size = 256

Training setup

import os
import glob
import random

import torch
from PIL import Image
from torch.optim import Adam
from torch.utils.data import DataLoader
from torchvision import datasets

style_name = args.style.split("/")[-1].split(".")[0]
os.makedirs(f"images/train/{style_name}_training", exist_ok=True)
os.makedirs("checkpoints", exist_ok=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_dataset = datasets.ImageFolder(args.dataset, train_transform(args.image_size))
dataloader = DataLoader(train_dataset, batch_size=args.batch_size)
transformer = TransformerNet().to(device)
vgg = VGG16(requires_grad=False).to(device)
if args.model_path:
    transformer.load_state_dict(torch.load(args.model_path))
optimizer = Adam(transformer.parameters(), args.lr)
l2_loss = torch.nn.MSELoss().to(device)
# Precompute the style image's Gram matrices once
style = style_transform(args.style_size)(Image.open(args.style))
style = style.repeat(args.batch_size, 1, 1, 1).to(device)
features_style = vgg(style)
gram_style = [gram_matrix(y) for y in features_style]
# Fixed sample images used to visualize training progress
image_samples = []
for path in random.sample(glob.glob(f"{args.dataset}/*/*"), len(train_dataset)):
    image_samples += [style_transform(args.image_size)(Image.open(path).resize((224, 224)))]
image_samples = torch.stack(image_samples)

Run the training loop

from torchvision.utils import save_image


def save_result(sample):
    transformer.eval()
    with torch.no_grad():
        output = transformer(image_samples.to(device))
    # Stack originals above their stylized versions for comparison
    image_rgb = denormalize(torch.cat((image_samples.cpu(), output.cpu()), 2))
    save_image(image_rgb, f"images/train/{style_name}_training/{sample}.jpg", nrow=4)
    transformer.train()


def save_model(sample):
    torch.save(transformer.state_dict(), f"checkpoints/{style_name}_{sample}.pth")


for epoch in range(args.epochs):
    for batch_i, (images, _) in enumerate(dataloader):
        batches_done = epoch * len(dataloader) + batch_i + 1
        optimizer.zero_grad()

        images_original = images.to(device)
        images_transformed = transformer(images_original)

        features_original = vgg(images_original)
        features_transformed = vgg(images_transformed)

        # Content (image) loss: match relu2_2 features of the original
        img_loss = args.lambda_img * l2_loss(features_transformed.relu2_2, features_original.relu2_2)

        # Style loss: match Gram matrices at all four feature levels
        style_loss = 0
        for ft_y, gm_s in zip(features_transformed, gram_style):
            gm_y = gram_matrix(ft_y)
            style_loss += l2_loss(gm_y, gm_s[: images.size(0), :, :])
        style_loss *= args.lambda_style

        total_loss = img_loss + style_loss
        total_loss.backward()
        optimizer.step()
        if batches_done % args.result_checkpoint == 0:
            save_result(batches_done)
        if args.model_checkpoint > 0 and batches_done % args.model_checkpoint == 0:
            save_model(batches_done)

After the 1,000th iteration:

After the 12,000th iteration (2400 epochs * (20 / batch_size)), the effect is clearly visible.

At this point training is complete and we can generate predictions.

Prediction:

Configure the prediction parameters:

predict_args = PredictArgs()
args = predict_args.initialize().parse_args()
args.image_path = './images/input/001.jpg'
args.model_path = './checkpoints/the_starry_night_12000.pth'

Prediction code:

import os

import torch
from PIL import Image
from torchvision.utils import save_image

os.makedirs("images/output", exist_ok=True)
device = torch.device("cpu")  # or: torch.device("cuda" if torch.cuda.is_available() else "cpu")
transform = style_transform()
transformer = TransformerNet().to(device)
transformer.load_state_dict(torch.load(args.model_path))
transformer.eval()
image_tensor = transform(Image.open(args.image_path)).to(device)  # the deprecated Variable() wrapper is unnecessary
image_tensor = image_tensor.unsqueeze(0)

with torch.no_grad():
    output_image = denormalize(transformer(image_tensor)).cpu()

name = args.image_path.split("/")[-1]
save_image(output_image, f"images/output/output_{name}")

Ideas & References

https://github.com/elleryqueenhomels/fast_neural_style_transfer/tree/master

https://github.com/AaronJny/DeepLearningExamples/tree/master/tf2-neural-style-transfer

https://github.com/Huage001/PaintTransformer

https://github.com/eriklindernoren/Fast-Neural-Style-Transfer/tree/master

https://github.com/NeverGiveU/PaintTransformer-Pytorch-master

https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix
