【Python】使用BIRCH算法对KDDCUP99数据集的聚类结果进行可视化/计算香农熵

neunms    2020-06-06 20:27

1. 背景:在写DoS论文时,需要计算基于BIRCH算法聚类实验生成的数据子集的香农熵。根据香农熵的差异来表示不同的数据子集和原始数据之间是存在差异的,同时原始数据集的较大的香农熵也说明该数据集具有相对复杂的数据分布规律,间接证明使用BIRCH算法对数据进行预分类的操作是有意义的。
同时需要将数据子集的数据可视化以展现聚类算法的有效性。
2. 计算香农熵:代码中的数据集是10%的KDDCUP99数据集,该数据集经过归一化和独热编码的处理,其中仅包括Normal数据和DoS数据。香农熵的计算公式如下:
        根据香农熵的定义可以知道:
$$H(X) = -\sum_{i=1}^{n} p(x_i)\,\ln p(x_i)$$
其中 $p(x_i)$ 为第 $i$ 类标签在数据集中所占的比例(下面的代码使用自然对数)。
        实验结果显示不同数据子集和未进行聚类的数据集的香农熵有很大差距:
import numpy as np
import pandas as pd
import math
from collections import Counter
class InformationGain():
    """Compute the Shannon entropy of a label sequence.

    The entropy is ``-sum(p_i * ln(p_i))`` over the distinct label values,
    where ``p_i`` is the fraction of samples carrying label ``i``.  The
    natural logarithm is used, so the result is expressed in nats.
    """

    def __init__(self, feature, label):
        """Calculate and store the entropy of ``label``.

        Args:
            feature: Feature matrix. Kept for interface compatibility; it
                does not take part in the entropy calculation.
            label: Sequence of class labels (list, NumPy array, ...).
                An empty sequence yields an entropy of 0.
        """
        labels = np.asarray(label).ravel()
        total = labels.size
        if total == 0:
            # No samples: there is no distribution, report zero entropy.
            self.shannoEnt = 0.0
            return

        # One vectorized pass instead of re-scanning the labels per class.
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / total
        entropy = float(-np.sum(probabilities * np.log(probabilities)))
        # Adding 0.0 turns the -0.0 produced by a single-class input into 0.0.
        self.shannoEnt = entropy + 0.0

    def getEnt(self):
        """Return the Shannon entropy (in nats) computed at construction."""
        return self.shannoEnt

def read_dataset(fname=u"../datasets/one_hot_kdd.csv"):
    """Load a header-less CSV file and replace missing values with 0.

    Args:
        fname: Path (or file-like object) of the CSV data, read as UTF-8.

    Returns:
        pandas.DataFrame with default integer column labels in which
        every missing value has been replaced by 0.
    """
    frame = pd.read_csv(fname, header=None, encoding="utf-8")
    return frame.fillna(0)

if __name__ == '__main__':
    # Load the dataset and print it as a quick sanity check.
    train = read_dataset()
    print(train)
    # Column 38 is used as the label vector; pop() removes it from `train`
    # in place, so the remaining columns form the feature matrix.
    y = train.pop(38).values
    X = train
    print(InformationGain(X, y).getEnt())
3.数据的可视化:由于原始数据集是42维,经过数据预处理之后的维度是178维,因此采用t-SNE算法将数据降维到2维便于可视化,由于数据量太大,仅选取10000行数据进行可视化。
# coding='utf-8'

from time import time
import pandas as pd
from sklearn.manifold import TSNE
def get_data(fname=u"../datasets/one_hot_kdd.csv"):
    """Read up to 10000 rows of a header-less CSV and split off column 38.

    Missing values are replaced with 0 before the split.

    Args:
        fname: Path (or file-like object) of the CSV data, read as UTF-8.

    Returns:
        A tuple ``(X, y, n_samples, n_features)``: the DataFrame without
        column 38, the values of column 38 as a NumPy array, and the row
        and column counts of ``X``.
    """
    frame = pd.read_csv(fname, encoding="utf-8", header=None, nrows=10000).fillna(0)
    # pop() removes column 38 from the frame in place and hands it back.
    y = frame.pop(38).values
    n_samples, n_features = frame.shape
    return frame, y, n_samples, n_features
def main():
    """Embed the data returned by ``get_data`` into two dimensions with t-SNE.

    Returns:
        pandas.DataFrame wrapping the array produced by
        ``TSNE.fit_transform``.
    """
    # Only the feature frame is needed here; the other values are ignored.
    features, _labels, _n_samples, _n_features = get_data()
    print('Computing t-SNE embedding')
    # random_state=0 is passed to TSNE exactly as in the original call.
    embedding = TSNE(n_components=2, random_state=0).fit_transform(features)
    return pd.DataFrame(embedding)
if __name__ == '__main__':
    # Compute the embedding and store it as a plain CSV without index or
    # header row.
    df = main()
    df.to_csv(
        '../datasets/one_hot_kdd_tsne.csv',
        header=False,
        index=False,
        sep=',',
    )
降维之后将原始数据集和降维之后的数据集同时放到画图模块中,提取原始数据集中的数据和标签进行BIRCH算法的聚类,得到聚类结果标签序列;提取降维后数据集的数据和聚类结果标签序列构成一个新的数据集用来对不同聚类中的散点进行可视化:
# coding='utf-8'
from sklearn.cluster import Birch
from time import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import datasets
from sklearn.manifold import TSNE

# Font setting so that Chinese labels are displayed correctly, and
# minus-sign setting so that negative signs are displayed correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})


def get_tsne_data(dataname=u"../datasets/one_hot_kdd_tsne.csv"):
    """Read up to 10000 rows of a header-less CSV file.

    Args:
        dataname: Path (or file-like object) of the CSV data, read as UTF-8.

    Returns:
        pandas.DataFrame with default integer column labels.
    """
    return pd.read_csv(dataname, header=None, nrows=10000, encoding="utf-8")


def get_data(dataname=u"../datasets/one_hot_kdd.csv"):
    """Read up to 10000 rows of a header-less CSV and split off column 38.

    Args:
        dataname: Path (or file-like object) of the CSV data, read as UTF-8.

    Returns:
        A tuple ``(dataset, label)``: the DataFrame without column 38 and
        column 38 itself as a pandas Series.
    """
    data_label = pd.read_csv(dataname, encoding="utf-8", header=None, nrows=10000)
    # Replace missing values with 0 so the frame contains no NaN when it is
    # handed to the clustering step.
    data_label = data_label.fillna(0)
    # pop() removes column 38 from the frame in place and hands it back.
    label = data_label.pop(38)
    return data_label, label


def birch(data, tsnedata, n_clusters=4, threshold=0.6):
    """Cluster ``data`` with BIRCH and save a scatter plot of the result.

    The cluster labels are computed on ``data``; the points that are drawn
    come from ``tsnedata`` (columns 0 and 1), coloured by cluster label.
    The figure is written to ``聚类结果散点图.svg`` in the working directory
    and the label frequencies are printed.

    Args:
        data: Samples passed to ``Birch.fit_predict``.
        tsnedata: pandas.DataFrame with columns 0 and 1 holding the 2-D
            coordinates to plot. Its rows are selected with a boolean mask
            built from the cluster labels, so it needs one row per sample
            of ``data``.
        n_clusters: Forwarded to ``Birch`` (default 4, as before).
        threshold: Forwarded to ``Birch`` (default 0.6, as before).
    """
    from collections import Counter

    model = Birch(n_clusters=n_clusters, threshold=threshold)
    labels = model.fit_predict(data)
    print(Counter(labels))

    colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868']

    # Min-max scale every coordinate column. A constant column has a zero
    # span; dividing by 1 there avoids turning the column into NaN.
    x_min, x_max = tsnedata.min(axis=0), tsnedata.max(axis=0)
    span = (x_max - x_min).replace(0, 1)
    scaled = (tsnedata - x_min) / span

    fig = plt.figure(figsize=(12, 7))
    ax = fig.add_subplot(111)
    # Draw only the labels that actually occur, one scatter series each;
    # colours are reused cyclically if there are more labels than colours.
    for cluster in np.unique(labels):
        cluster = int(cluster)
        scaled[labels == cluster].plot(
            x=0, y=1, kind="scatter",
            label="聚类标签 {}".format(cluster),
            color=colors[cluster % len(colors)],
            fontsize=12, ax=ax,
        )

    ax.set_xlabel(" ", fontsize=14)
    ax.set_ylabel(" ", fontsize=14)
    plt.savefig('聚类结果散点图.svg')


if __name__ == '__main__':
    # Cluster on the original features, plot with the t-SNE coordinates.
    # The label column returned by get_data() is not used here.
    features, _label = get_data()
    tsne_dataset = get_tsne_data()
    birch(features, tsne_dataset)

 
Last Modified: 2020-06-07 12:39
Views: 2.8K

[[total]] comments

Post your comment
  1. [[item.time]]
    [[item.user.username]] [[item.floor]]Floor
  2. Click to load more...
  3. Post your comment