卷积神经网络

Table of Contents

  1. 卷积概念及其在边缘检测中的应用.
  2. 池化的概念.
  3. 卷积神经网络识别手写数字.
  4. VGG.
  5. tiny VGG.

在这里插入图片描述 在这里插入图片描述

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# 1. 构造一个简单的图像
X = np.ones((6, 8))
X[:, 2:6] = 0
print(X.shape)
plt.imshow(X, cmap='gray')
plt.show()

K = np.array([[1.0, -1.0]])

# 输入图像转换格式为 [batch, height, width, channels]
# 卷积核转换格式为 [filter_height, filter_width, in_channels, out_channels]
Y = tf.nn.conv2d(X.reshape((1, 6, 8, 1)), K.reshape((1, 2, 1, 1)), strides=[1, 1, 1, 1], padding='VALID')

print(Y.shape)

plt.imshow(Y.numpy().reshape(6,7), cmap='gray')
plt.show()

#将两幅图像对比绘制
# 创建一个画布,其中包含两个子图,布局为2行1列
fig, axs = plt.subplots(2, 1, sharex=True, figsize=(10, 8))  # figsize的单位是英寸
# 绘制第一张图
axs[0].imshow(X, cmap='gray')
axs[0].set_title('X')
# 绘制第二张图
axs[1].imshow(Y.numpy().reshape(6,7), cmap='gray')
axs[1].set_title('Y')
# 显示图形
plt.show()


# 加入池化层
max_pool_2d = tf.keras.layers.MaxPooling2D(padding="same")

P=max_pool_2d(Y)
plt.imshow(P.numpy().reshape(3,4), cmap='gray')
plt.show()

# 创建一个画布,其中包含三个子图,布局为3行1列
fig, axs = plt.subplots(3, 1, sharex=True, figsize=(10, 8))  # figsize的单位是英寸

# 绘制第一张图
axs[0].imshow(X, cmap='gray')
axs[0].set_title('X')

# 绘制第二张图
axs[1].imshow(Y.numpy().reshape(6,7), cmap='gray')
axs[1].set_title('Y')

# 绘制第三张图
axs[2].imshow(P.numpy().reshape(3,4), cmap='gray')
axs[2].set_title('P')

# 显示图形
plt.show()

import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import torchvision.transforms.functional as TF

# 1. 加载图像并转为灰度(方便单通道展示)
img = Image.open('D:\\work\\mygithub\\docs\\intoAI\\src\\chapter2\\images\\intoAI_2_1.png').convert('L') 
img_tensor = TF.to_tensor(img).unsqueeze(0) # 形状变为 [1, 1, H, W]

# 2. 定义边缘检测卷积核
# 这是一个经典的 Sobel 边缘检测算子
sobel_kernel = torch.tensor([[[-1., -1., -1.],
                              [-1.,  8., -1.],
                              [-1., -1., -1.]]])

# 调整形状为 [1, 1, 3, 3] (输出通道, 输入通道, 卷积核高, 卷积核宽)
kernel = sobel_kernel.view(1, 1, 3, 3)

# 3. 进行卷积操作
# padding=1 保持图像边缘信息,stride=1 逐像素滑动
conv_output = nn.functional.conv2d(img_tensor, kernel, padding=1)

# 4. 可视化
plt.imshow(conv_output.squeeze().numpy(), cmap='gray')
plt.title("Convolution Output (Edge Detection)")
plt.show()


# 1. 对卷积后的特征图进行最大池化 (池化核大小为 2x2,步长为 2)
# 这会让图像的宽和高各缩小一半
max_pool_output = nn.functional.max_pool2d(conv_output, kernel_size=2, stride=2)

# 2. 打印形状对比
print(f"卷积后特征图形状: {conv_output.shape}")
print(f"池化后特征图形状: {max_pool_output.shape}")

# 3. 可视化
plt.imshow(max_pool_output.squeeze().numpy(), cmap='gray')
plt.title("Max Pooling Output (Downsampled)")
plt.show()

max_pool_2d = tf.keras.layers.MaxPooling2D(padding="same")

x = np.array([[1., 2., 3.],
              [4., 5., 6.],
              [7., 8., 9.]])
x = np.reshape(x, [1, 3, 3, 1])
max_pool_2d(x)

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# 1. 加载图像并转为灰度 --> [batch, height, width, channels]
imagePath = "D:\\work\\mygithub\\docs\\intoAI\\src\\chapter2\\images\\intoAI_2_1.png"
img = tf.keras.utils.load_img(imagePath, color_mode='grayscale')
plt.imshow(img, cmap='gray')
plt.show()
X = np.array(img, dtype='int32' ).reshape(1, 28, 28, 1)

# 2. 定义边缘检测卷积核 --> [filter_height, filter_width, in_channels, out_channels]
# 这是一个经典的 Sobel 边缘检测算子
sobel_kernel = np.array([[[-1., -1., -1.],
                              [-1.,  8., -1.],
                              [-1., -1., -1.]]])
K = sobel_kernel.reshape((3, 3, 1, 1))

# 3. 进行卷积操作
Y = tf.nn.conv2d(X, K, strides=[1, 1, 1, 1], padding='VALID')
print(Y.shape)
plt.imshow(Y.numpy().reshape(26,26), cmap='gray')
plt.show()


# 4. 对卷积后的特征图进行最大池化 (池化核大小为 2x2,步长为 2)
max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(2, 2),strides=(1, 1), padding="valid")
max_pool_output = max_pool_2d(Y)

# 5. 打印形状对比
print(f"卷积后特征图形状: {Y.shape}")
print(f"池化后特征图形状: {max_pool_output.shape}")

plt.imshow(max_pool_output.numpy().reshape(25,25), cmap='gray')
plt.show()

在这里插入图片描述 在这里插入图片描述 在这里插入图片描述 在这里插入图片描述 在这里插入图片描述 在这里插入图片描述 在这里插入图片描述 在这里插入图片描述 在这里插入图片描述 在这里插入图片描述 在这里插入图片描述

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
import numpy as np
import matplotlib.pyplot as plt

mnist = tf.keras.datasets.mnist
(x_train, y_train), _ = mnist.load_data()
x_train=x_train.reshape(x_train.shape[0], x_train.shape[1], x_train.shape[2], 1)

batch_size = 64
num_classes = 10
epochs = 10
img_height = 28
img_width = 28

model = Sequential([
  layers.Rescaling(1./255, input_shape=(img_height, img_width, 1)),
  layers.Conv2D(16, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(32, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(64, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Dropout(0.2),
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  layers.Dense(num_classes, name="outputs")
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.summary()

log_dir = "D:\\mnist\\logs"
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

history = model.fit(
  x_train, y_train,
  batch_size=batch_size,
  validation_split=0.2,
  epochs=epochs,
  callbacks=[tensorboard_callback]
)

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs_range = range(epochs)

plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

imagePath = "D:\\work\\mygithub\\docs\\intoAI\\src\\chapter2\\images\\intoAI_2_10.bmp"
img = tf.keras.utils.load_img(imagePath, color_mode='grayscale')
print(img.size)  # 输出图像尺寸
print(img.mode)  # 输出图像模式,应为 'L' 表示灰度图像
img_array = (np.expand_dims(img,0))
print(img_array.shape)
predictions = model.predict(img_array)
print(predictions)
score = tf.nn.softmax(predictions[0])
print(np.argmax(score))

论文:Very Deep Convolutional Networks for Large-Scale Image Recognition

牛津大学VGG研究组:Visual Geometry Group

  1. VGGNet的结构非常简洁,整个网络都使用了同样大小的卷积核尺寸(3x3)和最大池化尺寸(2x2)
  2. 几个小滤波器(3x3)卷积层的组合比一个大滤波器(5x5或7x7)卷积层好
  3. 验证了通过不断加深网络结构可以提升性能

VGG16

conv1  = Conv2D(filters=64, kernel_size=(3,3), padding="same", activation="relu")(_input)
conv2  = Conv2D(filters=64, kernel_size=(3,3), padding="same", activation="relu")(conv1)
pool1  = MaxPooling2D((2, 2))(conv2)

conv3  = Conv2D(filters=128, kernel_size=(3,3), padding="same", activation="relu")(pool1)
conv4  = Conv2D(filters=128, kernel_size=(3,3), padding="same", activation="relu")(conv3)
pool2  = MaxPooling2D((2, 2))(conv4)

conv5  = Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu")(pool2)
conv6  = Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu")(conv5)
conv7  = Conv2D(filters=256, kernel_size=(3,3), padding="same", activation="relu")(conv6)
pool3  = MaxPooling2D((2, 2))(conv7)

conv8  = Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu")(pool3)
conv9  = Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu")(conv8)
conv10 = Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu")(conv9)
pool4  = MaxPooling2D((2, 2))(conv10)

conv11 = Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu")(pool4)
conv12 = Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu")(conv11)
conv13 = Conv2D(filters=512, kernel_size=(3,3), padding="same", activation="relu")(conv12)
pool5  = MaxPooling2D((2, 2))(conv13)

flat   = Flatten()(pool5)
dense1 = Dense(4096, activation="relu")(flat)
dense2 = Dense(4096, activation="relu")(dense1)
output = Dense(1000, activation="softmax")(dense2)

vgg16_model  = Model(inputs=_input, outputs=output)

tiny-vgg:CNN-explainer

import tensorflow as tf
import numpy as np
import pandas as pd
import re
from shutil import copyfile
from glob import glob
from json import load, dump
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D,Activation
from tensorflow.keras import Model, Sequential
from os.path import basename
from time import time
import os

# 数据存放的位置
os.chdir('C:\\Users\\haiye\\Desktop\\data')
print(os.getcwd())


def process_path_train(path):
    """
    Get the (class label, processed image) pair of the given image path. This
    funciton uses python primitives, so you need to use tf.py_funciton wrapper.
    This function uses global variables:

        WIDTH(int): the width of the targeting image
        HEIGHT(int): the height of the targeting iamge
        NUM_CLASS(int): number of classes

    Args:
        path(string): path to an image file
    """

    # Get the class
    path = path.numpy()
    image_name = basename(path.decode('ascii'))
    label_name = re.sub(r'(.+)_\d+\.JPEG', r'\1', image_name)
    label_index = tiny_class_dict[label_name]['index']

    # Convert label to one-hot encoding
    label = tf.one_hot(indices=[label_index], depth=NUM_CLASS)
    label = tf.reshape(label, [NUM_CLASS])

    # Read image and convert the image to [0, 1] range 3d tensor
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, [WIDTH, HEIGHT])

    return(img, label)


def process_path_test(path):
    """
    Get the (class label, processed image) pair of the given image path. This
    funciton uses python primitives, so you need to use tf.py_funciton wrapper.
    This function uses global variables:

        WIDTH(int): the width of the targeting image
        HEIGHT(int): the height of the targeting iamge
        NUM_CLASS(int): number of classes

    The filepath encoding for test images is different from training images.

    Args:
        path(string): path to an image file
    """

    # Get the class
    path = path.numpy()
    image_name = basename(path.decode('ascii'))
    label_index = tiny_val_class_dict[image_name]['index']

    # Convert label to one-hot encoding
    label = tf.one_hot(indices=[label_index], depth=NUM_CLASS)
    label = tf.reshape(label, [NUM_CLASS])

    # Read image and convert the image to [0, 1] range 3d tensor
    img = tf.io.read_file(path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, [WIDTH, HEIGHT])

    return(img, label)


def prepare_for_training(dataset, batch_size=32, cache=True,
                         shuffle_buffer_size=1000):

    if cache:
        if isinstance(cache, str):
            dataset = dataset.cache(cache)
        else:
            dataset = dataset.cache()

    # Only shuffle elements in the buffer size
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    # Pre featch batches in the background
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return dataset


@tf.function
def train_step(image_batch, label_batch):
    with tf.GradientTape() as tape:
        # Predict
        predictions = tiny_vgg(image_batch)

        # Update gradient
        loss = loss_object(label_batch, predictions)
        gradients = tape.gradient(loss, tiny_vgg.trainable_variables)
        optimizer.apply_gradients(zip(gradients, tiny_vgg.trainable_variables))

        train_mean_loss(loss)
        train_accuracy(label_batch, predictions)


@tf.function
def vali_step(image_batch, label_batch):
    predictions = tiny_vgg(image_batch)
    vali_loss = loss_object(label_batch, predictions)

    vali_mean_loss(vali_loss)
    vali_accuracy(label_batch, predictions)


@tf.function
def test_step(image_batch, label_batch):
    predictions = tiny_vgg(image_batch)
    test_loss = loss_object(label_batch, predictions)

    test_mean_loss(test_loss)
    test_accuracy(label_batch, predictions)
	

WIDTH = 64
HEIGHT = 64
EPOCHS = 1000
PATIENCE = 50
LR = 0.001
NUM_CLASS = 10
BATCH_SIZE = 32


# Create training and validation dataset
tiny_class_dict = load(open('./data/class_dict_10.json', 'r'))
tiny_val_class_dict = load(open('./data/val_class_dict_10.json', 'r'))

training_images = './data/class_10_train/*/images/*.JPEG'
vali_images = './data/class_10_val/val_images/*.JPEG'
test_images = './data/class_10_val/test_images/*.JPEG'

# Create training dataset
train_path_dataset = tf.data.Dataset.list_files(training_images)

train_labeld_dataset = train_path_dataset.map(
    lambda path: tf.py_function(
        process_path_train,
        [path],
        [tf.float32, tf.float32]
    )
)


# Create vali dataset
vali_path_dataset = tf.data.Dataset.list_files(vali_images)

vali_labeld_dataset = vali_path_dataset.map(
    lambda path: tf.py_function(
        process_path_test,
        [path],
        [tf.float32, tf.float32]
    )
)


# Create test dataset
test_path_dataset = tf.data.Dataset.list_files(test_images)

test_labeld_dataset = test_path_dataset.map(
    lambda path: tf.py_function(
        process_path_test,
        [path],
        [tf.float32, tf.float32]
    )
)


train_dataset = prepare_for_training(train_labeld_dataset,
                                     batch_size=BATCH_SIZE)
vali_dataset = prepare_for_training(vali_labeld_dataset,
                                    batch_size=BATCH_SIZE)
test_dataset = prepare_for_training(test_labeld_dataset,
                                    batch_size=BATCH_SIZE)
									
									
# Use Keras Sequential API instead, since it is easy to save the model
filters = 10
tiny_vgg = Sequential([
    Conv2D(filters, (3, 3), input_shape=(64, 64, 3), name='conv_1_1'),
    Activation('relu', name='relu_1_1'),
    Conv2D(filters, (3, 3), name='conv_1_2'),
    Activation('relu', name='relu_1_2'),
    MaxPool2D((2, 2), name='max_pool_1'),

    Conv2D(filters, (3, 3), name='conv_2_1'),
    Activation('relu', name='relu_2_1'),
    Conv2D(filters, (3, 3), name='conv_2_2'),
    Activation('relu', name='relu_2_2'),
    MaxPool2D((2, 2), name='max_pool_2'),

    Flatten(name='flatten'),
    Dense(NUM_CLASS, activation='softmax', name='output')
])


# "Compile" the model with loss function and optimizer
loss_object = tf.keras.losses.CategoricalCrossentropy()
# optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
optimizer = tf.keras.optimizers.SGD(learning_rate=LR)

train_mean_loss = tf.keras.metrics.Mean(name='train_mean_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')

vali_mean_loss = tf.keras.metrics.Mean(name='vali_mean_loss')
vali_accuracy = tf.keras.metrics.CategoricalAccuracy(name='vali_accuracy')


# Initialize early stopping parameters
no_improvement_epochs = 0
best_vali_loss = np.inf
start_time = time()
print('Start training.\n')

for epoch in range(EPOCHS):
    # Train
    for image_batch, label_batch in train_dataset:
        train_step(image_batch, label_batch)

    # Predict on the test dataset
    for image_batch, label_batch in vali_dataset:
        vali_step(image_batch, label_batch)

    template = 'epoch: {}, train loss: {:.4f}, train accuracy: {:.4f}, '
    template += 'vali loss: {:.4f}, vali accuracy: {:.4f}'
    print(template.format(epoch + 1,
                          train_mean_loss.result(),
                          train_accuracy.result() * 100,
                          vali_mean_loss.result(),
                          vali_accuracy.result() * 100))

    # Early stopping
    if vali_mean_loss.result() < best_vali_loss:
        no_improvement_epochs = 0
        best_vali_loss = vali_mean_loss.result()
        # Save the best model
        tiny_vgg.save('trained_vgg_best.h5')
    else:
        no_improvement_epochs += 1

    if no_improvement_epochs >= PATIENCE:
        print('Early stopping at epoch = {}'.format(epoch))
        break

    # Reset evaluation metrics
    train_mean_loss.reset_states()
    train_accuracy.reset_states()
    vali_mean_loss.reset_states()
    vali_accuracy.reset_states()

print('\nFinished training, used {:.4f} mins.'.format((time() -
                                                       start_time) / 60))


# Save trained model
tiny_vgg.save('trained_tiny_vgg.h5')
tiny_vgg = tf.keras.models.load_model('trained_vgg_best.h5')

# Test on hold-out test images
test_mean_loss = tf.keras.metrics.Mean(name='test_mean_loss')
test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')

for image_batch, label_batch in test_dataset:
    test_step(image_batch, label_batch)

template = '\ntest loss: {:.4f}, test accuracy: {:.4f}'
print(template.format(test_mean_loss.result(),
                      test_accuracy.result() * 100))