已转存为csv格式,想存储为gml格式使用Gephi进行可视化。(受服务器资源限制,NEUSNCP的DATALAB还不足以可视化这么大的数据集)
话不多说,直接上代码:
import networkx as nx
import csv
def progress(percent, width=50):
    """Draw a one-line text progress bar on standard output.

    The line starts with a carriage return so that successive calls
    overwrite each other; a trailing newline is written only once
    ``percent`` reaches 100.

    Args:
        percent: Completion percentage. Values outside 0..100 are clamped.
        width: Number of character cells between the two brackets.
    """
    # Clamp on both sides so a caller that over- or under-shoots never
    # produces an over-long bar or a negative percentage.
    percent = max(0, min(100, percent))
    finished = percent >= 100
    bar = '#' * int(width * percent / 100)
    # Until completion the line has no newline, so a line-buffered stdout
    # would hold the bar back; flush explicitly to make it visible.
    print('\r[%-*s] %d%%' % (width, bar, percent),
          end='\n' if finished else '', flush=True)
def _count_lines(path):
    """Return the number of lines in the UTF-8 text file at ``path``."""
    # Stream the file rather than calling ``readlines()`` so a very large
    # dataset is never held in memory just to be counted.
    with open(path, encoding='utf-8') as f:
        return sum(1 for _ in f)


def read_from_csv(path_node, path_edge, keys):
    """Build a directed graph from a node CSV file and an edge CSV file.

    Each non-empty row of ``path_node`` becomes a node identified by its
    first column; each non-empty row of ``path_edge`` becomes an edge from
    its first column to its second column. Progress is printed to stdout.

    Args:
        path_node: Path of the comma-separated, UTF-8 node file.
        path_edge: Path of the comma-separated, UTF-8 edge file.
        keys: Attribute names, matched positionally against the columns of
            each node row. If a row has fewer columns than ``keys``, only
            the available columns are stored.

    Returns:
        The populated ``networkx.DiGraph``.
    """
    G = nx.DiGraph()

    # Line totals are only used as the denominator of the progress bar.
    total_nodes = _count_lines(path_node)
    print('节点总计:', total_nodes)
    total_edges = _count_lines(path_edge)
    print('关系总计:', total_edges)

    # Nodes. ``newline=''`` is what the csv module expects so that newlines
    # embedded in quoted fields are parsed correctly.
    print('\n读取节点', path_node)
    with open(path_node, encoding='utf-8', newline='') as f:
        shown = -1
        for i, row in enumerate(csv.reader(f, delimiter=',')):
            if row:  # csv.reader yields [] for blank lines
                G.add_node(row[0])
                G.nodes[row[0]].update(zip(keys, row))
            # Redraw the bar only when the whole-number percentage changes;
            # printing once per row dominates the runtime on large files.
            percent = (i + 1) * 100 // total_nodes
            if percent != shown:
                progress(percent)
                shown = percent

    # Edges.
    print('\n读取边', path_edge)
    with open(path_edge, encoding='utf-8', newline='') as f:
        shown = -1
        for i, row in enumerate(csv.reader(f, delimiter=',')):
            if row:
                G.add_edge(row[0], row[1])
            percent = (i + 1) * 100 // total_edges
            if percent != shown:
                progress(percent)
                shown = percent
    return G
def get_sina_sql_graph(keys):
    """Build a directed graph from the local ``weibo`` MySQL database.

    Every row of table ``weibo`` becomes a node identified by its first
    column; every row of table ``weiborelation`` becomes an edge from its
    first column to its second column. Progress is printed to stdout.

    Args:
        keys: Attribute names, matched positionally against the columns
            returned by ``SELECT * FROM weibo``. If there are more names
            than columns, the surplus names are ignored.

    Returns:
        The populated ``networkx.DiGraph``.
    """
    # Kept function-local so that importing this module does not require
    # the MySQL driver.
    import pymysql
    import networkx as nx
    import datetime

    # NOTE(review): connection settings and credentials are hard-coded for
    # a local development database.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='password',db='weibo')
    try:
        cur = conn.cursor()
        try:
            G = nx.DiGraph()

            # Row totals are only used as progress-bar denominators.
            cur.execute("SELECT COUNT(*) FROM weibo")
            total_nodes = cur.fetchone()[0]
            print('节点总计:', total_nodes)
            cur.execute("SELECT COUNT(*) FROM weiborelation")
            total_edges = cur.fetchone()[0]
            print('关系总计:', total_edges)

            # Nodes: one per weibo row, with attributes named by ``keys``.
            cur.execute("SELECT * FROM weibo")
            shown = -1
            for i, row in enumerate(cur.fetchall()):
                G.add_node(row[0])
                attrs = G.nodes[row[0]]
                for key, value in zip(keys, row):
                    # Dates are stored as text -- presumably because the GML
                    # writer cannot serialise date objects; confirm.
                    if isinstance(value, datetime.date):
                        value = value.strftime('%Y-%m-%d %H:%M:%S')
                    attrs[key] = value
                # Redraw only when the whole-number percentage changes;
                # printing once per row is very slow on large tables.
                percent = (i + 1) * 100 // total_nodes
                if percent != shown:
                    progress(percent)
                    shown = percent

            # Edges: one per weiborelation row.
            cur.execute("SELECT * FROM weiborelation")
            shown = -1
            for i, row in enumerate(cur.fetchall()):
                G.add_edge(row[0], row[1])
                percent = (i + 1) * 100 // total_edges
                if percent != shown:
                    progress(percent)
                    shown = percent
        finally:
            cur.close()
    finally:
        # Always release the connection, even if a query or a row fails.
        conn.close()
    return G
if __name__ == '__main__':
    # Route 1: read the user data from local CSV files and save it as GML.
    # These names are matched positionally against the columns of each
    # user row and stored as node attributes.
    user_attr_keys = [
        'uid', 'username', 'name', 'province', 'city', 'location', 'url',
        'gender', 'followersnum', 'friendsnum', 'statusnum',
        'favouritemsnum', 'created',
    ]
    user_graph = read_from_csv(
        './Weibo-user-relations/user.csv',
        './Weibo-user-relations/userrelation.csv',
        user_attr_keys,
    )
    nx.write_gml(user_graph, './user_relations.gml')

    # Route 2: read the weibo data from MySQL and save it as GML.
    # These names are matched positionally against the columns of each
    # weibo row and stored as node attributes.
    weibo_attr_keys = [
        'mid', 'data', 'text', 'source', 'repostsnum', 'commentsnum',
        'attuidesnum', 'uid', 'topic',
    ]
    weibo_graph = get_sina_sql_graph(weibo_attr_keys)
    nx.write_gml(weibo_graph, './weibo_relations.gml')
但是遇到了这样的问题:原来是 write_gml 中,节点的属性值中不能包含“_”。