본문 바로가기

머신러닝

파이썬 Gradient Boosting 바닥부터 구현하기

기말고사 공부 할 겸 Gradient Boosting을 바닥부터 구현해 보았다.

나는 이해하지 못하면 암기를 못하는 편이다...

찝찝함을 없애기 위해 한번 파이썬으로 Gradient Boosting을 구현해 보았다.

하악...

 

 

코드 >>

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
 
def probability(log_odds: float) -> np.array:
    """Map log-odds to a probability via the logistic (sigmoid) function."""
    odds = np.exp(log_odds)
    return odds / (odds + 1)
 
class GradientBoostingClassifier:
    """Binary gradient-boosting classifier built from scratch.

    Boosts regression trees on the log-odds scale: each round fits a tree
    to the pseudo-residuals (y - p) of the logistic loss, converts each
    leaf into an optimal log-odds increment (gamma), and adds the shrunken
    increments to the running logits.
    """

    def __init__(self,
                 X: np.array,
                 y: np.array,
                 n_boost: int = 100,
                 learning_rate: float = 0.1,
                 dt_max_depth: int = 1,
                 dt_max_leaf_nodes: int = 3):
        """Store the training data and pre-allocate per-round history.

        X: (n_samples, n_features) feature matrix.
        y: (n_samples,) 0/1 integer labels.
        n_boost: number of boosting rounds (round 0 is the constant model).
        learning_rate: shrinkage applied to every tree's contribution.
        dt_max_depth: max depth of each regression-tree base learner.
        dt_max_leaf_nodes: stored for reference; not used by fit() yet.
        """
        # Class counts: y_bin[0] = negatives, y_bin[1] = positives.
        y_bin = np.bincount(y)

        self.X = X
        self.ans = y
        self.n_samples = y.shape[0]

        # Initial model is the constant log-odds of the POSITIVE class,
        # log(P(y=1) / P(y=0)).  (The original used log(majority/minority),
        # which is only correct when class 1 happens to be the majority.)
        self.init_logit = np.log(y_bin[1] / y_bin[0])
        # The constant model predicts the same P(y=1) for every sample.
        # (The original stored 1-p for class-0 samples, which corrupted the
        # first round's residuals y - p.)
        self.init_prob = np.full(self.n_samples, probability(self.init_logit))
        # Binary cross-entropy of the constant model.
        self.init_loss = -(self.ans * np.log(self.init_prob) +
                           (1 - self.ans) * np.log(1 - self.init_prob)).sum()

        self.n_boost = n_boost
        self.learning_rate = learning_rate
        self.dt_max_depth = dt_max_depth
        self.dt_max_leaf_nodes = dt_max_leaf_nodes

        # Per-round training history; row i is the state after round i.
        self.logits = np.zeros([self.n_boost, self.n_samples])
        self.residuals = np.zeros([self.n_boost, self.n_samples])
        self.loss = np.zeros([self.n_boost])
        self.probs = np.zeros([self.n_boost, self.n_samples])
        self.preds = np.zeros([self.n_boost, self.n_samples])
        self.scores = np.zeros([self.n_boost])
        # gamma: per-sample update; gamma_values: per-leaf update, padded
        # to n_samples columns (a tree never has more leaves than samples).
        self.gamma = np.zeros([self.n_boost, self.n_samples])
        self.gamma_values = np.zeros([self.n_boost, self.n_samples])
        self.decision_trees: list = []
        # Sorted unique leaf ids seen at TRAINING time, one array per round;
        # predict() must use these (not the test set's leaves) so that the
        # leaf -> gamma_values column mapping stays aligned.
        self.tree_leaves: list = []

        self.logits[0] = self.init_logit  # broadcast the constant logit
        self.probs[0] = self.init_prob
        self.loss[0] = self.init_loss

    def fit(self) -> None:
        """Run the boosting rounds, filling the per-round history arrays."""
        for i in range(0, self.n_boost - 1):
            # Pseudo-residuals of the logistic loss: y - P(y=1).
            self.residuals[i] = self.ans - self.probs[i]
            # Fit the base learner to the residuals.  (The original fit it
            # to the raw labels, which is not gradient boosting.)
            dt = DecisionTreeRegressor(max_depth=self.dt_max_depth)
            dt = dt.fit(self.X, self.residuals[i])
            self.decision_trees.append(dt)

            leaf_indices = dt.apply(self.X)
            unique_leaves = np.unique(leaf_indices)  # sorted
            self.tree_leaves.append(unique_leaves)
            n_leaf = len(unique_leaves)

            # Optimal leaf value for the logistic loss:
            # sum(residuals) / sum(p * (1 - p)) over the leaf's samples.
            for ileaf in range(n_leaf):
                in_leaf = leaf_indices == unique_leaves[ileaf]
                prev_prob = self.probs[i][in_leaf]
                denominator = np.sum(prev_prob * (1 - prev_prob))
                nominator = np.sum(self.residuals[i][in_leaf])
                self.gamma_values[i][ileaf] = nominator / denominator

            # Map each sample to its leaf's gamma via the position of its
            # leaf id in the sorted unique_leaves array.
            positions = np.searchsorted(unique_leaves, leaf_indices)
            self.gamma[i] = self.gamma_values[i][positions]

            self.logits[i + 1] = self.logits[i] + self.learning_rate * self.gamma[i]
            self.probs[i + 1] = probability(self.logits[i + 1])
            self.preds[i + 1] = (self.probs[i + 1] > 0.5) * 1.0
            self.scores[i + 1] = np.sum(self.preds[i + 1] == self.ans) / self.n_samples
            # Logistic loss in logit form: sum(-y*f + log(1 + exp(f))).
            self.loss[i + 1] = np.sum(-self.ans * self.logits[i + 1] +
                                      np.log(1 + np.exp(self.logits[i + 1])))

    def predict(self, X: np.array) -> np.array:
        """Return hard 0/1 predictions for X by summing all tree updates."""
        logits = np.full(len(X), self.init_logit)
        for i in range(0, self.n_boost - 1):
            leaf_indices = self.decision_trees[i].apply(X)
            # Look leaves up in the TRAINING-time leaf list; recomputing
            # np.unique on the test set (as the original did) mis-aligns
            # the gamma columns whenever a leaf is not reached by X.
            positions = np.searchsorted(self.tree_leaves[i], leaf_indices)
            logits += self.gamma_values[i][positions] * self.learning_rate

        probs = probability(logits)
        return (probs >= 0.5) * 1.0
 
 
 
#######################################
#               Test                  #
#######################################
# Sanity check on the breast-cancer dataset: train on 85%, score on 15%.
data = load_breast_cancer()

X = data.data        # extraction had dropped the "X =" / "y =" names
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=1)

gbc1 = GradientBoostingClassifier(X_train, y_train, n_boost=100,
                                  learning_rate=0.1, dt_max_depth=1)
gbc1.fit()
preds = gbc1.predict(X_test)
scores = np.mean(preds == y_test)  # test-set accuracy
print(scores)