5-11. The Gaussian kernel (RBF)
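The RBF (Gaussian) kernel scores the similarity of two samples as

K(x, z) = exp(-gamma * ||x - z||^2)

The larger gamma is, the faster similarity decays with distance, so the decision boundary can wrap more tightly around individual points (and may overfit); a smaller gamma gives a smoother boundary.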
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
x, y = datasets.make_moons(n_samples=1000, noise=0.25, random_state=2020)  # generate 1000 samples
plt.figure()
plt.scatter(x[y == 0, 0], x[y == 0, 1], color="r")
plt.scatter(x[y == 1, 0], x[y == 1, 1], color="g")
plt.title('Scatter plot')
plt.show()
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2020)
# Plot the decision boundary
def plot_decision_boundary(model, axis):
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)).reshape(-1, 1)
    )
    x_new = np.c_[x0.ravel(), x1.ravel()]
    y_pre = model.predict(x_new)
    zz = y_pre.reshape(x0.shape)
    # Region colors
    cus = ListedColormap(["#BA55D3", "#FF69B4", "#FFE4C4"])
    plt.contourf(x0, x1, zz, cmap=cus)


def RBFkernelSVC(gamma):  # pipeline: standardization + SVC with a Gaussian (RBF) kernel
    return Pipeline([
        ("std", StandardScaler()),
        ("svc", SVC(kernel="rbf", gamma=gamma))
    ])
sv = RBFkernelSVC(gamma=1)
sv.fit(x_train, y_train)
plot_decision_boundary(sv, axis=[-1.8, 2.5, -1.4, 1.8])
plt.scatter(x[y == 0, 0], x[y == 0, 1], color="r")
plt.scatter(x[y == 1, 0], x[y == 1, 1], color="g")
plt.title('Gaussian RBF kernel')
plt.show()
# Print the test score
print(sv.score(x_test, y_test))
d = datasets.load_iris()
x = d.data
y = d.target
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2020)
sv = RBFkernelSVC(gamma=10)
sv.fit(x_train, y_train)
# Print the test score
print(sv.score(x_test, y_test))
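To get a feel for how gamma trades bias against variance, a quick sweep over a few values can be run on the same iris split; the gamma values below are arbitrary illustrations, not recommendations.

# Illustrative gamma sweep on the split above: training accuracy typically
# rises with gamma while test accuracy can eventually drop (overfitting).
for g in [0.1, 1, 10, 100]:
    model = RBFkernelSVC(gamma=g)
    model.fit(x_train, y_train)
    print('gamma={:>5}: train={:.3f}  test={:.3f}'.format(
        g, model.score(x_train, y_train), model.score(x_test, y_test)))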
A radial basis function (RBF) neural network implemented in Python
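An RBF network predicts with a weighted sum of radial basis functions centered on its hidden neurons:

f(x) = sum_j w_j * phi(||x - mu_j||, sigma_j)

where the mu_j are the centers, the sigma_j the widths, and phi a radial function such as the Gaussian or the multiquadric. With the centers and widths fixed, f is linear in the weights w, so training reduces to a linear least-squares fit; that is exactly what the lstsq call in the implementation below does.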
from numpy import array, append, vstack, transpose, reshape, \
    dot, true_divide, mean, exp, sqrt, log, \
    loadtxt, savetxt, zeros, frombuffer
from numpy.linalg import norm, lstsq
from multiprocessing import Process, Array
from random import sample
from time import time
from sys import stdout
from ctypes import c_double
from h5py import File


# Euclidean distance between two vectors
def metrics(a, b):
    return norm(a - b)


def gaussian(x, mu, sigma):
    return exp(-metrics(mu, x)**2 / (2 * sigma**2))


def multiQuadric(x, mu, sigma):
    return pow(metrics(mu, x)**2 + sigma**2, 0.5)


def invMultiQuadric(x, mu, sigma):
    return pow(metrics(mu, x)**2 + sigma**2, -0.5)


def plateSpine(x, mu):
    r = metrics(mu, x)
    return (r**2) * log(r)

class Rbf:
    def __init__(self, prefix='rbf', workers=4, extra_neurons=0, from_files=None):
        self.prefix = prefix
        self.workers = workers
        self.extra_neurons = extra_neurons

        # Import a partial model from existing HDF5 caches
        if from_files is not None:
            w_handle = self.w_handle = File(from_files['w'], 'r')
            mu_handle = self.mu_handle = File(from_files['mu'], 'r')
            sigma_handle = self.sigma_handle = File(from_files['sigma'], 'r')

            self.w = w_handle['w']
            self.mu = mu_handle['mu']
            self.sigmas = sigma_handle['sigmas']

            self.neurons = self.sigmas.shape[0]

    def _calculate_error(self, y):
        self.error = mean(abs(self.os - y))
        self.relative_error = true_divide(self.error, mean(y))

    def _generate_mu(self, x):
        n = self.n
        extra_neurons = self.extra_neurons

        # TODO: Make reusable
        mu_clusters = loadtxt('clusters100.txt', delimiter='\t')

        mu_indices = sample(range(n), extra_neurons)
        mu_new = x[mu_indices, :]
        mu = vstack((mu_clusters, mu_new))

        return mu

    def _calculate_sigmas(self):
        neurons = self.neurons
        mu = self.mu

        # Width of each neuron: twice the mean distance to the other centers
        sigmas = zeros((neurons,))
        for i in range(neurons):
            dists = [0 for _ in range(neurons)]
            for j in range(neurons):
                if i != j:
                    dists[j] = metrics(mu[i], mu[j])
            sigmas[i] = mean(dists) * 2
            # max(dists) / sqrt(neurons * 2)
        return sigmas

    def _calculate_phi(self, x):
        C = self.workers
        neurons = self.neurons
        mu = self.mu
        sigmas = self.sigmas
        phi = self.phi = None
        n = self.n

        def heavy_lifting(c, phi):
            s = jobs[c][1] - jobs[c][0]
            for k, i in enumerate(range(jobs[c][0], jobs[c][1])):
                for j in range(neurons):
                    # phi[i, j] = metrics(x[i, :], mu[j])**3
                    # phi[i, j] = plateSpine(x[i, :], mu[j])
                    # phi[i, j] = invMultiQuadric(x[i, :], mu[j], sigmas[j])
                    phi[i, j] = multiQuadric(x[i, :], mu[j], sigmas[j])
                    # phi[i, j] = gaussian(x[i, :], mu[j], sigmas[j])
                if k % 1000 == 0:
                    percent = true_divide(k, s) * 100
                    print(c, ': {:2.2f}%'.format(percent))
            print(c, ': Done')

        # Distribute the rows of phi across the workers through shared
        # memory (relies on fork-style multiprocessing)
        shared_array = Array(c_double, n * neurons)
        phi = frombuffer(shared_array.get_obj())
        phi = phi.reshape((n, neurons))

        jobs = []
        workers = []

        p = n // C  # integer division, so the job bounds are valid indices
        m = n % C
        for c in range(C):
            jobs.append((c * p, (c + 1) * p + (m if c == C - 1 else 0)))
            worker = Process(target=heavy_lifting, args=(c, phi))
            workers.append(worker)
            worker.start()

        for worker in workers:
            worker.join()

        return phi

    def _do_algebra(self, y):
        phi = self.phi

        # Solve for the output weights by linear least squares
        w = lstsq(phi, y, rcond=None)[0]
        os = dot(w, transpose(phi))
        return w, os

    def train(self, x, y):
        self.n = x.shape[0]

        ## Initialize HDF5 caches
        prefix = self.prefix
        postfix = str(self.n) + '-' + str(self.extra_neurons) + '.hdf5'
        name_template = prefix + '-{}-' + postfix
        phi_handle = self.phi_handle = File(name_template.format('phi'), 'w')
        os_handle = self.os_handle = File(name_template.format('os'), 'w')
        w_handle = self.w_handle = File(name_template.format('w'), 'w')
        mu_handle = self.mu_handle = File(name_template.format('mu'), 'w')
        sigma_handle = self.sigma_handle = File(name_template.format('sigma'), 'w')

        ## Mu generation
        mu = self.mu = self._generate_mu(x)
        self.neurons = mu.shape[0]
        print('({} neurons)'.format(self.neurons))
        # Save to HDF5
        mu_h5 = mu_handle.create_dataset('mu', data=mu)

        ## Sigma calculation
        print('Calculating Sigma...')
        sigmas = self.sigmas = self._calculate_sigmas()
        # Save to HDF5
        sigmas_h5 = sigma_handle.create_dataset('sigmas', data=sigmas)
        print('Done')

        ## Phi calculation
        print('Calculating Phi...')
        phi = self.phi = self._calculate_phi(x)
        print('Done')
        # Saving to HDF5
        print('Serializing...')
        phi_h5 = phi_handle.create_dataset('phi', data=phi)
        del phi
        self.phi = phi_h5
        print('Done')

        ## Algebra
        print('Doing final algebra...')
        w, os = self._do_algebra(y)
        self.w, self.os = w, os  # _calculate_error() reads self.os
        # Saving to HDF5
        w_h5 = w_handle.create_dataset('w', data=w)
        os_h5 = os_handle.create_dataset('os', data=os)

        ## Calculate error
        self._calculate_error(y)
        print('Done')

    def predict(self, test_data):
        # Materialize the h5py datasets with [()] (.value was removed in h5py 3)
        mu = self.mu = self.mu[()]
        sigmas = self.sigmas = self.sigmas[()]
        w = self.w = self.w[()]

        self.n = test_data.shape[0]  # _calculate_phi() takes the row count from self.n
        print('Calculating phi for test data...')
        phi = self._calculate_phi(test_data)
        os = dot(w, transpose(phi))
        savetxt('iok3834.txt', os, delimiter='\n')
        return os

    @property
    def summary(self):
        return '\n'.join(
            ['-----------------',
             'Training set size: {}'.format(self.n),
             'Hidden layer size: {}'.format(self.neurons),
             '-----------------',
             'Absolute error : {:02.2f}'.format(self.error),
             'Relative error : {:02.2f}%'.format(self.relative_error * 100)])


def predict(test_data):
    # Load the serialized model; [()] reads each dataset fully into memory
    mu = File('rbf-mu-212243-2400.hdf5', 'r')['mu'][()]
    sigmas = File('rbf-sigma-212243-2400.hdf5', 'r')['sigmas'][()]
    w = File('rbf-w-212243-2400.hdf5', 'r')['w'][()]

    n = test_data.shape[0]
    neur = mu.shape[0]

    # mu holds one center per row, which is what mu[j] expects below
    phi = zeros((n, neur))
    for i in range(n):
        for j in range(neur):
            phi[i, j] = multiQuadric(test_data[i, :], mu[j], sigmas[j])

    os = dot(w, transpose(phi))
    savetxt('iok3834.txt', os, delimiter='\n')
    return os
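A minimal usage sketch of the Rbf class above, assuming a clusters100.txt file of pre-computed cluster centers exists (as _generate_mu() requires) with the same number of columns as the inputs; every array shape and file name here is illustrative:

import numpy as np

# Illustrative data; in practice x and y come from your dataset.
x = np.random.rand(5000, 4)   # inputs, one sample per row
y = np.random.rand(5000)      # regression targets

model = Rbf(prefix='rbf', workers=4, extra_neurons=100)
model.train(x, y)             # also writes the HDF5 caches as a side effect
print(model.summary)

x_test = np.random.rand(100, 4)
predictions = model.predict(x_test)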
preface: For my recent task I have been processing data, extracting some simple features, and running machine-learning algorithms to see which feature combinations work best. This pipeline inevitably relies on many functions, so I am recording the ones I use all the time. Basically every one of them comes up: data preprocessing, then feature extraction, dimensionality reduction, training and prediction, checking the classification quality with a confusion matrix, and producing a report.
1. Input
Starting from the dataset, extract features to obtain a labeled dataset and turn it into vectors, then split it into a training set and a test set. I won't dwell on this; as discussed in the previous post, the StratifiedKFold() function does the job. From here on we work with the training set's data and target.
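A minimal sketch of such a split, with X and y as placeholder arrays:

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
for train_idx, test_idx in skf.split(X, y):
    x_train, x_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    # ... fit and evaluate on this fold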
2. Processing
def my_preprocessing(train_data):
    from sklearn import preprocessing
    # l2-normalize the feature columns (axis=0)
    X_normalized = preprocessing.normalize(train_data, norm="l2", axis=0)
    return X_normalized


def my_feature_selection(data, target):
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2
    data_new = SelectKBest(chi2, k=50).fit_transform(data, target)
    return data_new


def my_PCA(data):  # data without target, just train data, without train target
    from sklearn import decomposition
    pca_sklearn = decomposition.PCA()
    pca_sklearn.fit(data)
    main_var = pca_sklearn.explained_variance_
    print(sum(main_var) * 0.9)
    import matplotlib.pyplot as plt
    n = 15
    plt.plot(main_var[:n])
    plt.show()


def clf_train(data, target):
    from sklearn import svm
    # from sklearn.linear_model import LogisticRegression
    clf = svm.SVC(C=100, kernel="rbf", gamma=0.001)
    clf.fit(data, target)
    # clf_LR = LogisticRegression()
    # clf_LR.fit(x_train, y_train)
    # y_pred_LR = clf_LR.predict(x_test)
    return clf


def my_confusion_matrix(y_true, y_pred):
    from sklearn.metrics import confusion_matrix
    labels = sorted(set(y_true))
    conf_mat = confusion_matrix(y_true, y_pred, labels=labels)
    print("confusion_matrix(left labels: y_true, up labels: y_pred):")
    print("labels\t", end="")
    for i in range(len(labels)):
        print(labels[i], "\t", end="")
    print()
    for i in range(len(conf_mat)):
        print(labels[i], "\t", end="")
        for j in range(len(conf_mat[i])):
            print(conf_mat[i][j], "\t", end="")
        print()


def my_classification_report(y_true, y_pred):
    from sklearn.metrics import classification_report
    print("classification_report(left: labels):")
    print(classification_report(y_true, y_pred))
my_preprocessing() function:
Mainly uses the normalize() function from sklearn.preprocessing with the default l2 norm. With axis=0 the normalization is applied to the feature columns, so the squared values of each column sum to 1 (axis=1 would normalize each sample row instead).
my_feature_selection() function:
Uses the SelectKBest() and chi2() functions from sklearn.feature_selection. If you extracted many dimensions of sparse features with a bag of words, it is worth using the chi-squared test to keep the k most informative features.
my_PCA() function:
Mainly used to observe how many leading components are the dominant ones, and to plot them, to see how many components account for the main share of the variance.
clf_train() function:
Many machine-learning algorithms can be used here: SVM, LR, RF, GBDT and many more. Some, such as SVM, need parameter tuning, for which there are dedicated routines such as StratifiedKFold() (see the previous posts) to reach the optimum; a sketch of such a search follows below.
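As an illustration, such a search could look like the sketch below; the grid values are examples only, and data/target are the arrays used by clf_train() above:

from sklearn import svm
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Illustrative SVM parameter search with stratified cross-validation.
param_grid = {'C': [1, 10, 100], 'gamma': [0.0001, 0.001, 0.01]}
search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid,
                      cv=StratifiedKFold(n_splits=5))
search.fit(data, target)
print(search.best_params_, search.best_score_)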
my_confusion_matrix() function:
Mainly compares the predicted results against the true ones and computes the confusion matrix, so there is no need to work it out by hand. It covers every class, and the labels are sorted before being passed in.
my_classification_report() function:
Mainly uses the classification_report() function from sklearn.metrics to give, per class, the precision, recall and F-score, plus the macro averages, for judging how good the algorithm is. As for ROC curves, they need a binary problem; multi-class does not seem to work directly. A binary sketch follows below.
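For the binary case the ROC curve can be drawn roughly as follows; y_true holds the 0/1 labels and y_score is the classifier's decision_function() or predict_proba()[:, 1] output (both names are placeholders):

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

fpr, tpr, thresholds = roc_curve(y_true, y_score)
plt.plot(fpr, tpr, label='AUC = {:.3f}'.format(auc(fpr, tpr)))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()
plt.show()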
Main reference: the official sklearn documentation.
5-2. Using RBF as the kernel function
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
iris = datasets.load_iris()
# For simplicity, take only the first two features as classification inputs,
# so the decision boundary can be drawn in two dimensions
X = iris.data[:, :2]
y = iris.target
# Set up the SVC classifier with an RBF kernel and automatic gamma
svc = svm.SVC(kernel="rbf", C=1, gamma="auto").fit(X, y)
# Plotting parameters
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
h = (x_max - x_min) / 100  # grid step: 1/100 of the x range
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
plt.subplot(1, 1, 1)
# Predict with the fitted classifier
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# Draw the contour lines and fill the regions
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
# Limit the x range for a cleaner display
plt.xlim(xx.min(), xx.max())
plt.title('RBF as the kernel function')
plt.show()