这篇文章主要讲解了“python怎么实现AdaBoost算法”,文中的讲解内容简单清晰,易于学习与理解,下面请大家跟着小编的思路慢慢深入,一起来研究和学习“python怎么实现AdaBoost算法”吧!
import math
from math import exp
from math import log

import numpy as np
import pandas as pd


def create_data():
    """Build a two-feature, two-class data set from the iris data.

    Only the first 100 rows and the first two feature columns are kept,
    and label 0 is remapped to -1 so that labels are in {-1, 1}.

    Returns:
        A ``(features, labels)`` pair: a ``(100, 2)`` array and a ``(100,)``
        array sliced from the same underlying array.
    """
    # Imported lazily so the AdaBoost class below stays usable (and
    # importable) with only NumPy installed.
    from sklearn.datasets import load_iris

    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length',
                  'petal width', 'label']
    data = np.array(df.iloc[:100, [0, 1, -1]])
    data[data[:, -1] == 0, -1] = -1
    return data[:, :2], data[:, -1]


class AdaBoost:
    """AdaBoost built from single-feature threshold classifiers (stumps).

    Labels are expected to be -1 / 1.  ``learning_rate`` is used as the step
    between candidate thresholds when scanning a feature, not as a shrinkage
    factor on the classifier coefficients.
    """

    def __init__(self, n_estimators=50, learning_rate=1.0):
        self.clf_num = n_estimators
        self.learning_rate = learning_rate

    def init_args(self, datasets, labels):
        """Store the training data and reset all fitted state."""
        self.X = np.asarray(datasets)
        self.Y = np.asarray(labels)
        self.M, self.N = self.X.shape
        # Fitted weak classifiers as (axis, threshold, direction) tuples.
        self.clf_sets = []
        # Sample weights start out uniform.
        self.weights = [1.0 / self.M] * self.M
        # Coefficient alpha of each weak classifier, parallel to clf_sets.
        self.alpha = []

    def _G(self, features, labels, weights):
        """Find the best threshold stump on a single feature column.

        Candidate thresholds are ``min + learning_rate * i``; candidates that
        coincide with a sample value are skipped.

        Returns:
            ``(best_v, direct, error, compare_array)``: the chosen threshold,
            its direction (``'positive'`` predicts 1 above the threshold,
            ``'nagetive'`` predicts -1 above it), its weighted error and its
            predictions on ``features``.  When no candidate could be
            evaluated this is ``(0.0, None, 100000.0, None)``.
        """
        features = np.asarray(features, dtype=float)
        labels = np.asarray(labels)
        weights = np.asarray(weights, dtype=float)

        error = 100000.0  # sentinel larger than any possible weighted error
        best_v = 0.0
        direct, compare_array = None, None

        features_min = features.min()
        features_max = features.max()
        n_step = ((features_max - features_min + self.learning_rate)
                  // self.learning_rate)

        for i in range(1, int(n_step)):
            v = features_min + self.learning_rate * i
            if v in features:
                continue

            positive = np.where(features > v, 1, -1)
            negative = -positive
            error_positive = weights[positive != labels].sum()
            error_negative = weights[negative != labels].sum()

            if error_positive < error_negative:
                weight_error, candidate = error_positive, positive
                candidate_direct = 'positive'
            else:
                weight_error, candidate = error_negative, negative
                # Spelling kept as-is: the string is stored in clf_sets.
                candidate_direct = 'nagetive'

            if weight_error < error:
                error = weight_error
                compare_array = candidate
                best_v = v
                # The direction must be recorded together with the threshold
                # it belongs to; otherwise the direction of the last
                # candidate tried would be returned with best_v.
                direct = candidate_direct

        return best_v, direct, error, compare_array

    def _alpha(self, error):
        """Return the classifier coefficient ``0.5 * ln((1 - e) / e)``.

        The error is clipped away from 0 and 1 so that a perfect (or
        perfectly wrong) classifier gets a large finite coefficient instead
        of a division by zero / infinite value.
        """
        eps = 1e-10
        error = min(max(error, eps), 1 - eps)
        return 0.5 * np.log((1 - error) / error)

    def _Z(self, weights, a, clf):
        """Return the normalisation factor for the weight update."""
        margins = np.asarray(self.Y) * np.asarray(clf)
        return np.sum(np.asarray(weights, dtype=float) * np.exp(-a * margins))

    def _w(self, a, clf, Z):
        """Update ``self.weights`` in place from the predictions ``clf``."""
        margins = np.asarray(self.Y) * np.asarray(clf)
        updated = (np.asarray(self.weights, dtype=float)
                   * np.exp(-a * margins) / Z)
        self.weights[:] = updated.tolist()

    def _f(self, alpha, clf_sets):
        """Unused placeholder for the linear combination of classifiers."""
        pass

    def G(self, x, v, direct):
        """Apply one stump to a scalar feature value and return 1 or -1."""
        if direct == 'positive':
            return 1 if x > v else -1
        return -1 if x > v else 1

    def fit(self, X, y):
        """Fit up to ``n_estimators`` stumps on ``X`` (2-D) and ``y``.

        Boosting stops early when a stump classifies every sample correctly
        (that stump is kept) or when no candidate threshold can be evaluated
        on any feature.  If the latter happens in the first round the
        ensemble stays empty and ``predict`` returns -1 for every input.
        """
        self.init_args(X, y)

        for _ in range(self.clf_num):
            axis = 0
            final_direct = 'null'
            best_clf_error, best_v, clf_result = 100000, None, None

            # Pick the feature whose best stump has the lowest error.
            for j in range(self.N):
                v, direct, error, compare_array = self._G(
                    self.X[:, j], self.Y, self.weights)
                if error < best_clf_error:
                    best_clf_error = error
                    best_v = v
                    final_direct = direct
                    clf_result = compare_array
                    axis = j

            if clf_result is None:
                # No stump was produced; using clf_result below would raise
                # "TypeError: 'NoneType' object is not subscriptable".
                break

            a = self._alpha(best_clf_error)
            self.alpha.append(a)
            self.clf_sets.append((axis, best_v, final_direct))

            if best_clf_error == 0:
                # Nothing is misclassified, so there is nothing left to
                # re-weight; the perfect stump has already been recorded.
                break

            Z = self._Z(self.weights, a, clf_result)
            self._w(a, clf_result, Z)

    def predict(self, feature):
        """Return the sign (1 or -1) of the weighted vote for one sample."""
        result = 0.0
        for a, (axis, clf_v, direct) in zip(self.alpha, self.clf_sets):
            result += a * self.G(feature[axis], clf_v, direct)
        return 1 if result > 0 else -1

    def score(self, X_test, y_test):
        """Return the fraction of samples in ``X_test`` predicted correctly."""
        right_count = sum(
            1 for feature, target in zip(X_test, y_test)
            if self.predict(feature) == target)
        return right_count / len(X_test)


if __name__ == "__main__":
    from sklearn.model_selection import train_test_split

    X, y = create_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf = AdaBoost(n_estimators=3, learning_rate=0.5)
    clf.fit(X_train, y_train)
    print("评分:{}".format(clf.score(X_test, y_test)))
结果(train_test_split 每次随机划分训练集和测试集,所以每次运行的评分并不相同):有时1.0
有时0.75
有时0.6
有时0.4
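注意,如果不对 _G 的返回结果做判断就直接计算规范化因子,可能报错:TypeError: 'NoneType' object is not subscriptable。
原因是:在某些数据划分下,_G 中的每一个候选阈值 v 都恰好等于某个样本的特征值,被 if v not in features 全部跳过(或者根本没有候选阈值),于是没有任何划分被评估,返回的分类结果 compare_array 仍是初始值 None。计算规范化因子的时候,这个 None 被当作参数 clf 传入,这时候我们再用 clf[i],肯定是不行的,也就报了这个错误。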
import math
from math import exp, log

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split


def create_data():
    """Return (features, labels) taken from the first 100 iris rows.

    Features are the first two columns; labels equal to 0 are rewritten
    to -1 before the array is split into its feature and label parts.
    """
    raw = load_iris()
    frame = pd.DataFrame(raw.data, columns=raw.feature_names)
    frame['label'] = raw.target
    frame.columns = ['sepal length', 'sepal width', 'petal length',
                     'petal width', 'label']
    samples = np.array(frame.iloc[:100, [0, 1, -1]])
    # Boolean-mask assignment replaces the row-by-row relabelling loop.
    samples[samples[:, -1] == 0, -1] = -1
    return samples[:, :2], samples[:, -1]


# Train scikit-learn's AdaBoostClassifier on a random 80/20 split and
# print its accuracy on the held-out part.
X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = AdaBoostClassifier(n_estimators=100, learning_rate=0.5)
clf.fit(X_train, y_train)
print("评分:{}".format(clf.score(X_test, y_test)))
感谢各位的阅读,以上就是“python怎么实现AdaBoost算法”的内容了,经过本文的学习后,相信大家对python怎么实现AdaBoost算法这一问题有了更深刻的体会,具体使用情况还需要大家实践验证。这里是亿速云,小编将为大家推送更多相关知识点的文章,欢迎关注!
免责声明:本站发布的内容(图片、视频和文字)以原创、转载和分享为主,文章观点不代表本网站立场,如果涉及侵权请联系站长邮箱:is@yisu.com进行举报,并提供相关证据,一经查实,将立刻删除涉嫌侵权内容。