
4

【词云】听故事的故事人
source link: https://blog.feelyou.top/posts/1861734117.html
Go to the source link to read the article with its pictures, updated content, and better typesetting. If the link is broken, please click the button below to view a snapshot taken at that time.

【词云】听故事的故事人
2019-01-16
11
import collections  # word-frequency counting
import re
import sqlite3
import time

import jieba
from pyecharts import Bar
from pyecharts import Line
from pyecharts import Overlap
from pyecharts import WordCloud
# Word-cloud pipeline: read every comment row from the SQLite table, segment
# the comment text with jieba, drop stop words, count the remaining words and
# render the 100 most frequent ones as a pyecharts word cloud.
conn = sqlite3.connect('./music.db')
c = conn.cursor()
stopwords_list = ['吗', '的','啊',' ','是', '吧','了','在','就','有','呀','这', '很','要','…','给',]
word_list = []
# Text preprocessing: characters matched by this pattern are removed from a
# comment before segmentation. A raw string is used so the regex escapes are
# not parsed as (invalid) string escape sequences.
pattern = re.compile(r'\t|\n|\.|-|:|;|\)|\(|\?|\,|\。|\!|\?|"|\]|\[|\、|\”|\“|\~|\~|\、')
print("Opened database successfully")
try:
    cursor = c.execute("SELECT music_name, comment_id, user_id, user_name, avatar_url, comment_time, liked_count, comment from music2057578222")
    for row in cursor:
        # Column positions follow the SELECT list above.
        print('user_name:{}'.format(row[3]))
        print('comment_time:{}'.format(row[5]))
        print('liked_count:{}'.format(row[6]))
        print('comment:{}\n'.format(row[7]))
        # Strip the matched characters, then segment the cleaned comment.
        seg_list = list(jieba.cut(pattern.sub('', row[7])))
        print(seg_list)
        for seg in seg_list:
            if seg not in stopwords_list:
                word_list.append(seg)
            else:
                # Echo the stop words that were filtered out.
                print(seg)
    print("Operation done successfully")
finally:
    # Close the connection even if reading or segmenting a row fails.
    conn.close()

# Word-frequency statistics.
word_counts = collections.Counter(word_list)
# most_common() returns a list of (word, count) tuples, not a dict.
word_counts_top100 = word_counts.most_common(100)  # the 100 most frequent words
print(word_counts_top100)  # print for inspection

myWordCloud = WordCloud("评论词云", width=1000, height=620)
# Split the (word, count) pairs into the two parallel lists passed to add().
words = [word for word, _ in word_counts_top100]
value = [count for _, count in word_counts_top100]
myWordCloud.add("", words, value, shape="circle", word_size_range=[20, 100])
myWordCloud.render()
# Comment-time charts: count comments per calendar day (line chart) and per
# hour of day (bar chart with a line overlaid).
#
# The timestamps are read from the database here because the fragment
# originally referenced `row` outside of any loop and appended to lists that
# were never created.
comment_time = []
comment_time_day = []
comment_time_hour = []
conn = sqlite3.connect('./music.db')
try:
    for (timestamp,) in conn.execute("SELECT comment_time from music2057578222"):
        # The stored value is divided by 1000 before conversion, as in the
        # original code (presumably a millisecond epoch -- confirm against
        # the code that populates the table).
        moment = time.localtime(timestamp / 1000)
        comment_time.append(time.strftime("%Y-%m-%d %H:%M:%S", moment))
        comment_time_day.append(time.strftime("%Y-%m-%d", moment))
        comment_time_hour.append(time.strftime("%H", moment))
finally:
    conn.close()

# Comments per day, as (day, count) pairs sorted by date string.
day_counts = collections.Counter(comment_time_day)
day_counts = sorted(day_counts.items())
line = Line("评论日期")
line.add("", [i[0] for i in day_counts], [i[1] for i in day_counts],is_label_show=True, mark_point=["average","max"])
line.render('line.html')

# Comments per hour of day, as (hour, count) pairs sorted by hour string.
hour_counts = collections.Counter(comment_time_hour)
hour_counts = sorted(hour_counts.items())
bar = Bar("评论时间")
bar.add("bar", [i[0] for i in hour_counts], [i[1] for i in hour_counts],is_label_show=True)
line = Line()
line.add("line",[i[0] for i in hour_counts], [i[1] for i in hour_counts])
# Draw the bar and line series on one chart.
overlap = Overlap()
overlap.add(bar)
overlap.add(line)
overlap.render('bar.html')
Recommend
About Joyk
Aggregate valuable and interesting links.
Joyk means Joy of geeK