同时需要将数据子集的数据可视化以展现聚类算法的有效性。
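2. 计算香农熵:代码中的数据集是10%的KDDCUP99数据集,该数据集经过归一化和独热编码的处理,其中仅包括Normal数据和DoS数据。香农熵的计算公式如下:$$H(X) = -\sum_{i=1}^{n} p(x_i)\,\ln p(x_i)$$ 其中 $p(x_i)$ 表示第 $i$ 类标签在数据集中所占的比例,$n$ 为标签的类别数(下面的代码使用自然对数)。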
根据香农熵的定义可以知道:香农熵越小,数据集中样本的类别越单一(纯度越高)。实验结果显示,不同数据子集的香农熵与未进行聚类的数据集的香农熵有很大差距:
import numpy as np
import pandas as pd
import math
from collections import Counter
class InformationGain:
    """Compute the Shannon entropy of a sequence of class labels.

    The entropy is ``-sum(p * ln(p))`` taken over the distinct label values,
    where ``p`` is the fraction of samples carrying that value. The natural
    logarithm is used.

    Args:
        feature: Accepted so existing call sites keep working; it is not
            used by the entropy computation.
        label: Iterable of hashable class labels (list, tuple, 1-D NumPy
            array, pandas Series, ...).

    Attributes:
        shannoEnt: Entropy of ``label``; ``0`` when ``label`` is empty.
    """

    def __init__(self, feature, label):
        labels = list(label)
        total = len(labels)
        # One counting pass gives every class frequency, and works for plain
        # Python sequences as well as array types.
        counts = Counter(labels)
        self.shannoEnt = sum(
            -(count / total) * math.log(count / total)
            for count in counts.values()
        )

    def getEnt(self):
        """Return the Shannon entropy computed at construction time."""
        return self.shannoEnt
def read_dataset(fname=u"../datasets/one_hot_kdd.csv"):
    """Load a header-less CSV and replace missing cells with 0.

    Args:
        fname: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        A DataFrame with integer column labels and no missing values.
    """
    return pd.read_csv(fname, header=None, encoding="utf-8").fillna(0)
if __name__ == '__main__':
    # Load the dataset and echo it before any column is removed.
    dataset = read_dataset()
    print(dataset)
    # pop() removes column 38 from the frame in place, so what is left in
    # `dataset` afterwards are the remaining (feature) columns.
    targets = dataset.pop(38).values
    print(InformationGain(dataset, targets).getEnt())
3. 数据的可视化:原始数据集是42维,经过数据预处理之后的维度是178维,因此采用t-SNE算法将数据降到2维以便于可视化;由于数据量太大,仅选取前10000行数据进行可视化。
# coding='utf-8'
from time import time
import pandas as pd
from sklearn.manifold import TSNE
def get_data(fname=u"../datasets/one_hot_kdd.csv"):
    """Read at most 10000 rows of a header-less CSV and split off column 38.

    Missing cells are replaced with 0 before the split.

    Args:
        fname: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        A tuple ``(X, y, n_samples, n_features)``: the DataFrame without
        column 38, the values of column 38 as an array, and the row and
        column counts of ``X``.
    """
    frame = pd.read_csv(
        fname, header=None, nrows=10000, encoding="utf-8"
    ).fillna(0)
    targets = frame.pop(38).values
    row_count, column_count = frame.shape
    return frame, targets, row_count, column_count
def main():
    """Embed the loaded feature table into two dimensions with t-SNE.

    Returns:
        A DataFrame holding the 2-column result of ``TSNE.fit_transform``.
    """
    # Only the feature table is needed here; the other values returned by
    # get_data() are ignored.
    features = get_data()[0]
    print('Computing t-SNE embedding')
    embedding = TSNE(n_components=2, random_state=0).fit_transform(features)
    return pd.DataFrame(embedding)
if __name__ == '__main__':
    # Write the embedding as a bare CSV: no index column and no header row.
    embedding = main()
    embedding.to_csv(
        '../datasets/one_hot_kdd_tsne.csv',
        sep=',',
        index=False,
        header=False,
    )
降维之后将原始数据集和降维之后的数据集同时放到画图模块中,提取原始数据集中的数据和标签进行BIRCH算法的聚类,得到聚类结果标签序列;提取降维后数据集的数据和聚类结果标签序列构成一个新的数据集用来对不同聚类中的散点进行可视化:
# coding='utf-8'
from sklearn.cluster import Birch
from time import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.manifold import TSNE
# Use the SimHei font so Chinese labels render correctly, and disable the
# Unicode minus glyph so negative signs display correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
def get_tsne_data(dataname=u"../datasets/one_hot_kdd_tsne.csv"):
    """Read at most 10000 rows of a header-less CSV.

    Args:
        dataname: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        The parsed DataFrame with integer column labels.
    """
    return pd.read_csv(dataname, header=None, nrows=10000, encoding="utf-8")
def get_data(dataname=u"../datasets/one_hot_kdd.csv"):
    """Read at most 10000 rows of a header-less CSV and split off column 38.

    Args:
        dataname: Path (or file-like object) handed to ``pandas.read_csv``.

    Returns:
        A tuple ``(dataset, label)``: the DataFrame without column 38, and
        column 38 as a Series.
    """
    frame = pd.read_csv(dataname, encoding="utf-8", header=None, nrows=10000)
    # Replace missing cells with 0, as the other loaders of this CSV in the
    # file do, so the clustering input never contains NaN.
    frame = frame.fillna(0)
    label = frame.pop(38)
    return frame, label
def birch(data, tsnedata, n_clusters=4, threshold=0.6,
          output_path='聚类结果散点图.svg'):
    """Cluster ``data`` with BIRCH and save a scatter plot of ``tsnedata``.

    Each row of ``tsnedata`` is drawn in the colour of the cluster that BIRCH
    assigned to the corresponding row of ``data``.

    Args:
        data: Feature table passed to ``Birch.fit_predict``.
        tsnedata: DataFrame whose columns 0 and 1 are plotted as x and y
            after per-column min-max scaling. It must have the same number
            of rows as ``data``, because the cluster labels are used as a
            row mask on it.
        n_clusters: Forwarded to ``Birch``.
        threshold: Forwarded to ``Birch``.
        output_path: File the figure is saved to.
    """
    from collections import Counter

    labels = Birch(n_clusters=n_clusters, threshold=threshold).fit_predict(data)
    # Report how many samples landed in each cluster.
    print(Counter(labels))

    colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868']
    fig = plt.figure(figsize=(12, 7))
    ax = fig.add_subplot(111)

    # Min-max scale each column of the embedding.
    x_min, x_max = np.min(tsnedata, 0), np.max(tsnedata, 0)
    tsnedata = (tsnedata - x_min) / (x_max - x_min)

    # Iterate over the labels that were actually produced, so a cluster with
    # no members is never handed to the plotting call. Colours wrap around
    # when there are more clusters than palette entries.
    for cluster_id in np.unique(labels):
        tsnedata[labels == cluster_id].plot(
            x=0, y=1, kind="scatter",
            label="聚类标签 {}".format(cluster_id),
            color=colors[int(cluster_id) % len(colors)],
            fontsize=12, ax=ax,
        )

    ax.set_xlabel(" ", fontsize=14)
    ax.set_ylabel(" ", fontsize=14)
    plt.savefig(output_path)
if __name__ == '__main__':
    # Cluster on the original feature table and draw the points at their
    # coordinates from the t-SNE file; the label column is not needed here.
    features, _ = get_data()
    embedding = get_tsne_data()
    birch(features, embedding)