Notes on Processing Weibo Data

hxy    2019-07-26 11:27

Scenario: I have a Weibo dataset crawled into a MySQL database (see https://www.neusncp.com/user/blog?id=91), already exported to CSV, and I want to convert it to GML so it can be visualized with Gephi. (Server resources being limited, NEUSNCP's DATALAB cannot yet visualize a dataset this large.)
Without further ado, here is the code:
import networkx as nx
import csv


def progress(percent, width=50):
    '''Print a self-updating progress bar.'''
    percent = 100 if percent >= 100 else percent
    end_str = '\n' if percent >= 100 else ''
    # nested string formatting: build '[%-<width>s]' first, then fill it with '#' marks
    show_str = ('[%%-%ds]' % width) % (int(width * percent / 100) * "#")
    print('\r%s %d%%' % (show_str, percent), end=end_str)
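# Example: progress(40) redraws the bar in place via '\r', printing
# '[' + 20 * '#' + 30 * ' ' + '] 40%'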


def read_from_csv(path_node, path_edge, keys):
    '''Read user relation data from CSV files and build a directed graph.'''
    G = nx.DiGraph()
    # count total lines first so the progress bar has a denominator
    total_nodes = 1
    total_edges = 1
    with open(path_node, encoding='utf-8') as f:
        total_nodes = len(f.readlines())
        print('Total nodes:', total_nodes)
    with open(path_edge, encoding='utf-8') as f:
        total_edges = len(f.readlines())
        print('Total edges:', total_edges)

    # read nodes
    print('\nReading nodes from', path_node)
    with open(path_node, encoding='utf-8') as f:
        data = csv.reader(f, delimiter=',')
        for i, d in enumerate(data):
            G.add_node(d[0])
            # attach the node's attributes, one column per key
            for j, k in enumerate(keys):
                G.nodes[d[0]][k] = d[j]
            # print progress
            percent = (i + 1) * 100 / total_nodes
            progress(percent)
    # read edges
    print('\nReading edges from', path_edge)
    with open(path_edge, encoding='utf-8') as f:
        data = csv.reader(f, delimiter=',')
        for i, d in enumerate(data):
            G.add_edge(d[0], d[1])
            # print progress
            percent = (i + 1) * 100 / total_edges
            progress(percent)
    return G
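# Assumed CSV layout (no header row): each row of user.csv lines up with `keys`,
# with column 0 holding the uid that serves as the node id; each row of
# userrelation.csv is a source,target pair of uids.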


def get_sina_sql_graph(keys):
    '''Read Weibo posts and repost relations from MySQL and build the network G.'''
    # imports needed only for this path
    import pymysql
    import datetime

    # local MySQL connection settings
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='password', db='weibo')
    cur = conn.cursor()
    G = nx.DiGraph()

    # total number of posts (graph nodes)
    cur.execute("SELECT COUNT(*) FROM weibo")
    total_nodes = cur.fetchone()[0]
    print('Total nodes:', total_nodes)

    # total number of repost relations (graph edges)
    cur.execute("SELECT COUNT(*) FROM weiborelation")
    total_edges = cur.fetchone()[0]
    print('Total edges:', total_edges)

    # read posts as nodes
    cur.execute("SELECT * FROM weibo")
    for i, r in enumerate(cur.fetchall()):
        G.add_node(r[0])
        for j, k in enumerate(keys):
            # GML cannot store datetime objects, so format them as strings
            G.nodes[r[0]][k] = r[j] if not isinstance(r[j], datetime.date) else r[j].strftime('%Y-%m-%d %H:%M:%S')
        # print progress
        percent = (i + 1) * 100 / total_nodes
        progress(percent)

    # read repost relations and build the directed edges
    cur.execute("SELECT * FROM weiborelation")
    for i, r in enumerate(cur.fetchall()):
        G.add_edge(r[0], r[1])
        # print progress
        percent = (i + 1) * 100 / total_edges
        progress(percent)
    # close the database connection
    cur.close()
    conn.close()
    return G
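# Note: fetchall() pulls each full result set into memory; if the tables are too
# large for that, pymysql's streaming cursor (pymysql.cursors.SSCursor) can
# iterate over rows without buffering them all at once.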


if __name__ == '__main__':
    # Option 1: read the data from local csv files and save it as a gml file.
    # These keys name the user attributes attached to each node.
    keys = ['uid', 'username', 'name', 'province', 'city', 'location', 'url', 'gender',
            'followersnum', 'friendsnum', 'statusnum', 'favouritemsnum', 'created']
    path_user = './Weibo-user-relations/user.csv'
    path_user_relation = './Weibo-user-relations/userrelation.csv'
    G1 = read_from_csv(path_user, path_user_relation, keys)
    nx.write_gml(G1, './user_relations.gml')

    # Option 2: read the data from the MySQL database and save it as a gml file.
    # These keys name the post attributes attached to each node.
    keys = ['mid', 'date', 'text', 'source', 'repostsnum', 'commentsnum',
            'attitudesnum', 'uid', 'topic']
    G2 = get_sina_sql_graph(keys)
    nx.write_gml(G2, './weibo_relations.gml')
But running this hit a problem (the error screenshot is not reproduced here): it turns out that write_gml does not accept node attribute names that contain "_".
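If renaming the keys by hand is not an option, the underscores can be stripped right before saving. A minimal sketch (the helper name sanitize_for_gml is my own, and it assumes the attribute names remain valid GML keys once the "_" is removed):

import networkx as nx

def sanitize_for_gml(G):
    '''Copy G, deleting "_" from attribute names so write_gml accepts them.'''
    H = nx.DiGraph()
    for n, attrs in G.nodes(data=True):
        H.add_node(n, **{k.replace('_', ''): v for k, v in attrs.items()})
    H.add_edges_from(G.edges())
    return H

nx.write_gml(sanitize_for_gml(G1), './user_relations.gml')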
Last Modified: 2019-07-27 08:05