1. 단순한 신경망 구현: Logic Gate

# 필요 라이브러리
import numpy as np
import matplotlib.pyplot as plt
# 하이퍼 파라미터

# 몇 번 반복
epochs = 1000
lr = 0.1
# 유틸 함수들
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def mean_squared_error(pred_y, true_y):
    return 0.5 * (np.sum((true_y - pred_y)**2))

def cross_entropy_error(pred_y, true_y):
    if true_y.ndim ==1:
        true_y = true_y.reshape(1, -1)
        pred_y = pred_y.reshape(1, -1)

    delta = 1e-7
    return -np.sum(true_y * np.log(pred_y + delta))

# 배치 사이즈로 각 값들을 나눠줘야 함
def cross_entropy_error_for_batch(pred_y, true_y):
    if true_y.ndim ==1:
        true_y = true_y.reshape(1, -1)
        pred_y = pred_y.reshape(1, -1)

    delta = 1e-7
    batch_size = pred_y.shape[0]
    return -np.sum(true_y * np.log(pred_y + delta)) / batch_size

# 이진 분류일때
def cross_entropy_error_for_bin(pred_y, true_y):
    return 0.5 * np.sum((-true_y * np.log(pred_y) - (1 - true_y) * np.log(1 - pred_y)))

def softmax(a):
    exp_a = np.exp(a)
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a

    return y

def differential(f, x):
    eps = 1e-5
    diff_value = np.zeros_like(x)

    for i in range(x.shape[0]):
        temp_val = x[i]

        x[i] = temp_val + eps
        f_h1 = f(x)

        x[i] = temp_val - eps
        f_h2 = f(x)

        diff_value[i] = (f_h1 - f_h2) / (2 * eps)
        x[i] = temp_val
    
    return diff_value
# 신경망
class LogicGateNet():

    def __init__(self):
        def weight_init():
            np.random.seed(1)
            weights = np.random.randn(2)
            bias = np.random.rand(1)

            return weights, bias
        
        self.weights, self.bias = weight_init()

    def predict(self, x):
        W = self.weights.reshape(-1, 1)
        b = self.bias

        pred_y = sigmoid(np.dot(x, W) + b)
        return pred_y
    
    def loss(self, x, true_y):
        pred_y = self.predict(x)
        return cross_entropy_error_for_bin(pred_y, true_y)
    
    def get_gradient(self, x, t):
        def loss_grad(grad):
            return self.loss(x, t)
        
        grad_W = differential(loss_grad, self.weights)
        grad_B = differential(loss_grad, self.bias)

        return grad_W, grad_B

 

 

  - AND 게이트

AND = LogicGateNet()

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y = np.array([[0], [0], [0], [1]])

train_loss_list = list()

for i in range(epochs):
    grad_W, grad_B = AND.get_gradient(X, Y)

    AND.weights -= lr * grad_W
    AND.bias -= lr * grad_B

    loss = AND.loss(X, Y)
    train_loss_list.append(loss)

    if i%100 == 99:
        print("Epoch: {}, Cost: {}, Weights: {}, Bias: {}".format(i+1, loss, AND.weights, AND.bias))
        
# 출력 결과
Epoch: 100, Cost: 0.6886489498071491, Weights: [1.56426876 0.79168393], Bias: [-2.14871589]
Epoch: 200, Cost: 0.4946368603064415, Weights: [2.01360719 1.71241131], Bias: [-3.07894028]
Epoch: 300, Cost: 0.3920165980757418, Weights: [2.42841657 2.29753793], Bias: [-3.79103207]
Epoch: 400, Cost: 0.3257214374791936, Weights: [2.794852   2.73235738], Bias: [-4.37257095]
Epoch: 500, Cost: 0.27863601334755067, Weights: [3.11636193 3.08408364], Bias: [-4.86571237]
Epoch: 600, Cost: 0.24328504683831248, Weights: [3.40015395 3.38235762], Bias: [-5.29433736]
Epoch: 700, Cost: 0.21572536552468008, Weights: [3.65300561 3.64264217], Bias: [-5.67349792]
Epoch: 800, Cost: 0.19363244428365756, Weights: [3.88044124 3.87412053], Bias: [-6.01340133]
Epoch: 900, Cost: 0.1755321312790001, Weights: [4.08680123 4.08279091], Bias: [-6.32133891]
Epoch: 1000, Cost: 0.1604392693330146, Weights: [4.27548114 4.27284863], Bias: [-6.6027234]
  • 반복이 진행될 때마다 손실함수인 Cost가 점점 떨저짐
  • Weight값과 Bias값의 조정 과정도 살펴볼 수 있음
# AND 게이트 테스트
print(AND.predict(X))

# 출력 결과
[[0.00135483]
 [0.08867878]
 [0.08889176]
 [0.87496677]]
  • X값에 대해 실제 Y값은 0, 0, 0, 1임
  • AND 게이트 테스트 결과가 각각 1일 확률이고 마지막만 0.87로 높아 1로 분류, 나머지는 0.1 이하로 낮아 0으로 분류됨

 

  - OR 게이트

OR = LogicGateNet()
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y_2 = np.array([[0], [1], [1], [1]])

train_loss_list = list()

for i in range(epochs):
    grad_W, grad_B = OR.get_gradient(X, Y_2)

    OR.weights -= lr * grad_W
    OR.bias -= lr * grad_B

    loss = OR.loss(X, Y_2)
    train_loss_list.append(loss)

    if i%100 == 99:
        print("Epoch: {}, Cost: {}, Weights: {}, Bias: {}".format(i+1, loss, OR.weights, OR.bias))

# 출력 결과
Epoch: 100, Cost: 0.49580923848195635, Weights: [2.45484353 1.40566594], Bias: [-0.14439625]
Epoch: 200, Cost: 0.3398674231515118, Weights: [2.98631846 2.39448393], Bias: [-0.67661178]
Epoch: 300, Cost: 0.2573360986187996, Weights: [3.45016595 3.08431266], Bias: [-1.03721585]
Epoch: 400, Cost: 0.20630142190075948, Weights: [3.85230067 3.60865952], Bias: [-1.30598633]
Epoch: 500, Cost: 0.1716549922113493, Weights: [4.20195872 4.03000824], Bias: [-1.52060015]
Epoch: 600, Cost: 0.1466501884550824, Weights: [4.50867681 4.38171478], Bias: [-1.6994397]
Epoch: 700, Cost: 0.12779768649454676, Weights: [4.78049264 4.68334611], Bias: [-1.8527641]
Epoch: 800, Cost: 0.11310517185413338, Weights: [5.0237707 4.9472786], Bias: [-1.98691756]
Epoch: 900, Cost: 0.10135180918376233, Weights: [5.24347159 5.18181684], Bias: [-2.10611973]
Epoch: 1000, Cost: 0.09174843008614178, Weights: [5.44346811 5.39279833], Bias: [-2.21332947]
# OR 게이트 테스트
print(OR.predict(X))

# 출력 결과
[[0.09855987]
 [0.9600543 ]
 [0.96195283]
 [0.9998201 ]]

 

 

  - NAND 게이트

NAND = LogicGateNet()

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y_3 = np.array([[1], [1], [1], [0]])

train_loss_list = list()

for i in range(epochs):
    grad_W, grad_B = NAND.get_gradient(X, Y_3)

    NAND.weights -= lr * grad_W
    NAND.bias -= lr * grad_B

    loss = NAND.loss(X, Y_3)
    train_loss_list.append(loss)

    if i%100 == 99:
        print("Epoch: {}, Cost: {}, Weights: {}, Bias: {}".format(i+1, loss, NAND.weights, NAND.bias))

# 출력 결과
Epoch: 100, Cost: 0.7911738653769252, Weights: [-0.48972722 -1.25798774], Bias: [1.74566135]
Epoch: 200, Cost: 0.5430490957885361, Weights: [-1.51545093 -1.80261804], Bias: [2.79151756]
Epoch: 300, Cost: 0.4212591302740578, Weights: [-2.14614496 -2.26642639], Bias: [3.56506179]
Epoch: 400, Cost: 0.3456117101527486, Weights: [-2.607325   -2.66303355], Bias: [4.18521187]
Epoch: 500, Cost: 0.2931298605179329, Weights: [-2.97696333 -3.00501941], Bias: [4.70528682]
Epoch: 600, Cost: 0.2543396786002071, Weights: [-3.28850585 -3.30365261], Bias: [5.1539571]
Epoch: 700, Cost: 0.22443918596775067, Weights: [-3.55912171 -3.56778782], Bias: [5.54869527]
Epoch: 800, Cost: 0.20067626330853877, Weights: [-3.7989077  -3.80411461], Bias: [5.90108417]
Epoch: 900, Cost: 0.18134125517637367, Weights: [-4.01441395 -4.01767547], Bias: [6.21926514]
Epoch: 1000, Cost: 0.1653094408173465, Weights: [-4.21019696 -4.21231432], Bias: [6.50920952]
# NAND 게이트 테스트
print(NAND.predict(X))

# 출력 결과
[[0.99851256]
 [0.90861957]
 [0.90879523]
 [0.12861037]]

 

 

  - XOR 게이트

XOR = LogicGateNet()
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y_4 = np.array([[0], [1], [1], [0]])

train_loss_list = list()

for i in range(epochs):
    grad_W, grad_B = XOR.get_gradient(X, Y_4)

    XOR.weights -= lr * grad_W
    XOR.bias -= lr * grad_B

    loss = XOR.loss(X, Y_4)
    train_loss_list.append(loss)

    if i%100 == 99:
        print("Epoch: {}, Cost: {}, Weights: {}, Bias: {}".format(i+1, loss, XOR.weights, XOR.bias))

# 출력 결과
Epoch: 100, Cost: 1.4026852245456056, Weights: [ 0.47012771 -0.19931523], Bias: [-0.16097708]
Epoch: 200, Cost: 1.3879445622848308, Weights: [ 0.1572739  -0.03387161], Bias: [-0.07321056]
Epoch: 300, Cost: 1.386492030048381, Weights: [0.05525161 0.00089673], Bias: [-0.03330094]
Epoch: 400, Cost: 1.3863236205351948, Weights: [0.02049628 0.00504503], Bias: [-0.01514784]
Epoch: 500, Cost: 1.3862994743646844, Weights: [0.0080051  0.00361297], Bias: [-0.00689034]
Epoch: 600, Cost: 1.3862953430687464, Weights: [0.00326661 0.00201812], Bias: [-0.00313421]
Epoch: 700, Cost: 1.3862945581495083, Weights: [0.00137938 0.00102449], Bias: [-0.00142566]
Epoch: 800, Cost: 1.38629440139037, Weights: [0.00059716 0.00049628], Bias: [-0.00064849]
Epoch: 900, Cost: 1.3862943694120307, Weights: [0.00026303 0.00023435], Bias: [-0.00029498]
Epoch: 1000, Cost: 1.386294362832352, Weights: [0.0001172  0.00010905], Bias: [-0.00013418]
  • Cost를 확인해보면 다른 게이트에 비해 높아 잘 학습이 안된 모습
# XOR 게이트 테스트
print(XOR.predict(X))

# 출력 결과
[[0.49996646]
 [0.49999372]
 [0.49999575]
 [0.50002302]]
  • 테스트 결과도 전부 0.5주변에 머물러 학습이 잘 되지 않음을 확인
  • 2층 신경망으로 구현해야함

 

  - 2층 신경망으로 XOR 게이트 구현(1)

  • 얕은 신경망, Shallow Neural Network
  • 두 논리 게이트(NAND, OR)를 통과하고 AND 게이트로 합쳐서 구현
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y_5 = np.array([[0], [1], [1], [0]])

s1 = NAND.predict(X)
s2 = OR.predict(X)
X_2 = np.array([s1, s2]).T.reshape(-1, 2)

print(AND.predict(X_2))

# 출력 결과
[[0.12870357]
 [0.79966936]
 [0.80108545]
 [0.14420781]]
  • 0, 1, 1, 0에 가깝게 나온 결과

 

  - 2층 신경망으로 XOR 게이트 구현(2)

  • 클래스로 구현
class XORNet():
    def __init__(self):
        np.random.seed(1)

        def weight_init():
            params = {}
            params['w_1'] = np.random.randn(2)
            params['b_1'] = np.random.rand(2)
            params['w_2'] = np.random.randn(2)
            params['b_2'] = np.random.rand(2)
            return params
        
        self.params = weight_init()

    def predict(self, x):
        W_1, W_2 = self.params['w_1'].reshape(-1, 1), self.params['w_2'].reshape(-1, 1)
        B_1, B_2 = self.params['b_1'], self.params['b_2']

        A1 = np.dot(x, W_1) + B_1
        Z1 = sigmoid(A1)
        A2 = np.dot(Z1, W_2) + B_2
        pred_y = sigmoid(A2)

        return pred_y
    
    def loss(self, x, true_y):
        pred_y = self.predict(x)
        return cross_entropy_error_for_bin(pred_y, true_y)
    
    def get_gradient(self, x, t):
        def loss_grad(grad):
            return self.loss(x, t)
        
        grads = {}
        grads['w_1'] = differential(loss_grad, self.params['w_1'])
        grads['b_1'] = differential(loss_grad, self.params['b_1'])
        grads['w_2'] = differential(loss_grad, self.params['w_2'])
        grads['b_2'] = differential(loss_grad, self.params['b_2'])

        return grads

# 하이퍼 파라미터 재조정
lr = 0.3
# 모델 생성 및 학습
XOR = XORNet()
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
Y_5 = np.array([[0], [1], [1], [0]])

train_loss_list = list()

for i in range(epochs):
    grads = XOR.get_gradient(X, Y_5)

    for key in ('w_1', 'b_1', 'w_2', 'b_2'):
        XOR.params[key] -= lr * grads[key]
    
    loss = XOR.loss(X, Y_5)
    train_loss_list.append(loss)

    if i % 100 == 99:
        print("Epoch: {}, Cost: {}".format(i+1, loss))

# 출력 결과
Epoch: 100, Cost: 2.583421249699167
Epoch: 200, Cost: 0.6522444536804384
Epoch: 300, Cost: 0.2505164706195344
Epoch: 400, Cost: 0.14964904919118582
Epoch: 500, Cost: 0.10570445867337958
Epoch: 600, Cost: 0.0814030439804046
Epoch: 700, Cost: 0.06606149912973946
Epoch: 800, Cost: 0.05552519160632019
Epoch: 900, Cost: 0.04785478827730652
Epoch: 1000, Cost: 0.042027122417916646
# XOR 게이트 테스트
print(XOR.predict(X))

# 출력 결과
[[0.00846377]
 [0.98354369]
 [0.99163498]
 [0.0084976]]

 

 

2. 다중 클래스 분류: MNIST Dataset

  • 배치 처리
    • 학습 데이터 전체를 한번에 진행하지 않고,
      일부 데이터(샘플)을 확률적으로 구해서 조금씩 나누어 진행
    • 확률적 경사 하강법(Stochastic Gradient Descent) 또는 미니 배치 학습법(mini-batch learning)이라고 부름

 

# 필요한 라이브러리
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import time
from tqdm.notebook import tqdm

# mnist 데이터
mnist = tf.keras.datasets.mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

# 출력 결과
(60000, 28, 28)
(60000,)
(10000, 28, 28)
(10000,)
# 데이터 확인
# x_trian의 첫번째 데이터
img = x_train[0]
print(img.shape)  # (28, 28)
plt.imshow(img, cmap = 'gray')
plt.show()

# y_train의 첫번째 데이터
y_train[0]   # 5

 

  • 데이터 전처리
# 평탄화 함수
def flatten_for_mnist(x):
    temp = np.zeros((x.shape[0], x[0].size))

    for idx, data in enumerate(x):
        temp[idx, :] = data.flatten()

    return temp

# 정규화(색 표현 정규화)
x_train, x_test = x_train / 255.0, x_test / 255.0

# 평탄화
x_train = flatten_for_mnist(x_train)
x_test = flatten_for_mnist(x_test)

print(x_train.shape)
print(x_test.shape)

y_train_ohe = tf.one_hot(y_train, depth = 10).numpy()
y_test_ohe = tf.one_hot(y_test, depth = 10).numpy()

print(y_train_ohe.shape)
print(y_test_ohe.shape)
print(x_train[0].max(), x_test[0].min())
print(y_train_ohe[0])

# 출력 결과
0.00392156862745098 0.0
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]

# 전처리 과정을 통해 전체적으로 값이 스케일링됨을 확인

 

  • 하이퍼 파라미터
# 하이퍼 파라미터
epochs = 2
lr = 0.1
batch_size = 100
train_size = x_train.shape[0]

 

  • 사용되는 함수들
# 사용되는 함수들
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def mean_squared_error(pred_y, true_y):
    return 0.5 * (np.sum((true_y - pred_y)**2))

def cross_entropy_error(pred_y, true_y):
    if true_y.ndim ==1:
        true_y = true_y.reshape(1, -1)
        pred_y = pred_y.reshape(1, -1)

    delta = 1e-7
    return -np.sum(true_y * np.log(pred_y + delta))

# 배치 사이즈로 각 값들을 나눠줘야 함
def cross_entropy_error_for_batch(pred_y, true_y):
    if true_y.ndim ==1:
        true_y = true_y.reshape(1, -1)
        pred_y = pred_y.reshape(1, -1)

    delta = 1e-7
    batch_size = pred_y.shape[0]
    return -np.sum(true_y * np.log(pred_y + delta)) / batch_size

# 이진 분류일때
def cross_entropy_error_for_bin(pred_y, true_y):
    return 0.5 * np.sum((-true_y * np.log(pred_y) - (1 - true_y) * np.log(1 - pred_y)))

def softmax(a):
    exp_a = np.exp(a)
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a

    return y

def differential_1d(f, x):
    eps = 1e-5
    diff_value = np.zeros_like(x)

    for i in range(x.shape[0]):
        temp_val = x[i]

        x[i] = temp_val + eps
        f_h1 = f(x)

        x[i] = temp_val - eps
        f_h2 = f(x)

        diff_value[i] = (f_h1 - f_h2) / (2 * eps)
        x[i] = temp_val
    
    return diff_value

def differential_2d(f, X):
    if X.ndim == 1:
        return differential_1d(f, x)
    else:
        grad = np.zeros_like(X)

        for idx,  x in enumerate(X):
            grad[idx] = differential_1d(f, x)

        return grad

 

  • 다중분류 클래스 구현
class MyModel():
    def __init__(self):
        np.random.seed(1)

        def weight_init(input_nodes, hidden_nodes, output_units):
            np.random.seed(777)

            params = {}
            params['w_1'] = 0.01 * np.random.randn(input_nodes, hidden_nodes)
            params['b_1'] = np.zeros(hidden_nodes)
            params['w_2'] = 0.01 * np.random.randn(hidden_nodes, output_units)
            params['b_2'] = np.zeros(output_units)
            return params
        
        # 784는 x_train.shape[1], hidden은 임의의 64, output은 0~9까지의 숫자로 10개가 있으므로 10으로 지정
        self.params = weight_init(784, 64, 10)

    def predict(self, x):
        W_1, W_2 = self.params['w_1'], self.params['w_2']
        B_1, B_2 = self.params['b_1'], self.params['b_2']

        A1 = np.dot(x, W_1) + B_1
        Z1 = sigmoid(A1)
        A2 = np.dot(Z1, W_2) + B_2
        pred_y = softmax(A2)

        return pred_y
    
    def loss(self, x, true_y):
        pred_y = self.predict(x)
        return cross_entropy_error_for_bin(pred_y, true_y)
    
    def accuracy(self, x, true_y):
        pred_y = self.predict(x)
        y_argmax = np.argmax(pred_y, axis = 1)
        t_argmax = np.argmax(true_y, axis = 1)
        # 예측값과 실제값이 같은 값들의 합을 전체 수로 나눠 몇개 맞췄는지 비율 계산
        accuracy = np.sum(y_argmax == t_argmax) / float(x.shape[0])

        return accuracy
    
    def get_gradient(self, x, t):
        def loss_grad(grad):
            return self.loss(x, t)
        
        grads = {}
        grads['w_1'] = differential_2d(loss_grad, self.params['w_1'])
        grads['b_1'] = differential_2d(loss_grad, self.params['b_1'])
        grads['w_2'] = differential_2d(loss_grad, self.params['w_2'])
        grads['b_2'] = differential_2d(loss_grad, self.params['b_2'])

        return grads
# 모델 학습
model = MyModel()
train_loss_list = list()
train_acc_list = list()
test_acc_list = list()
iter_per_epoch = max(train_size / batch_size, 1)

start_time = time.time()
for i in tqdm(range(epochs)):

    batch_idx = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_idx]
    y_batch = y_train_ohe[batch_idx]

    grads = model.get_gradient(x_batch, y_batch)
    
    for key in grads.keys():
        model.params[key] -= lr * grads[key]
    
    loss = model.loss(x_batch, y_batch)
    train_loss_list.append(loss)

    train_accuracy = model.accuracy(x_train, y_train_ohe)
    test_accuracy = model.accuracy(x_test, y_test_ohe)
    train_acc_list.appned(train_accuracy)
    test_acc_list.append(test_accuracy)

    print("Epoch: {}, Cost: {}, Train Accuracy: {}, Test Accuracy: {}".format(i+1, loss, train_accuracy, test_accuracy))

end_time = time.time()

print("총 학습 소요시간: {:.3f}s".format(end_time - start_time))
  • 출력 결과

  • 학습이 잘 되지 않은 모습

 

3. 모델의 결과

  • 모델은 학습이 잘 될수도, 잘 안될 수도 있음
  • 만약 학습이 잘 안된다면,
    학습이 잘 되기 위해 어떤 조치를 취해야 하는가?
    • 다양한 학습 관련 기술이 존재

+ Recent posts