【词云】听故事的故事人

2019-01-16

import collections  # word-frequency counting
import re
import sqlite3
import time

import jieba
from pyecharts import Bar
from pyecharts import Line
from pyecharts import Overlap
from pyecharts import WordCloud

# Tokens that are dropped before the word-frequency count.
stopwords_list = ['吗', '的','啊',' ','是', '吧','了','在','就','有','呀','这', '很','要','…','给',]

# Every token kept from every comment, in reading order.
word_list = []
# Per-comment timestamps, filled while iterating the rows below and used by
# the per-day / per-hour charts further down the script.
comment_time = []
comment_time_day = []
comment_time_hour = []

# Text pre-processing: characters removed from a comment before segmentation.
# A raw-string character class avoids the invalid "\." / "\)" string escapes
# of a plain literal while matching exactly the same set of characters.
pattern = re.compile(r'[\t\n.\-:;)(?，。！？"\]\[、”“~～]')

conn = sqlite3.connect('./music.db')
c = conn.cursor()
print("Opened database successfully")
try:
    cursor = c.execute("SELECT music_name, comment_id, user_id, user_name, avatar_url, comment_time, liked_count, comment  from music2057578222")
    for row in cursor:
        print('user_name:{}'.format(row[3]))
        print('comment_time:{}'.format(row[5]))
        print('liked_count:{}'.format(row[6]))
        print('comment:{}\n'.format(row[7]))

        # Strip the punctuation matched by `pattern`, then segment with jieba.
        seg_list = list(jieba.cut(pattern.sub('', row[7])))
        print(seg_list)
        for seg in seg_list:
            if seg not in stopwords_list:
                word_list.append(seg)
            else:
                # Echo the dropped stop word so the filtering can be checked.
                print(seg)

        # row[5] is divided by 1000 before conversion, i.e. it is treated as a
        # millisecond epoch timestamp; it is rendered in local time.
        local_time = time.localtime(row[5] / 1000)
        comment_time.append(time.strftime("%Y-%m-%d %H:%M:%S", local_time))
        comment_time_day.append(time.strftime("%Y-%m-%d", local_time))
        comment_time_hour.append(time.strftime("%H", local_time))

    print("Operation done successfully")
finally:
    # Release the database handle even if reading or segmentation fails.
    conn.close()

# Word-frequency statistics.
word_counts = collections.Counter(word_list)
# most_common() returns a list of (word, count) tuples for the 100 most
# frequent words -- it is not a dict, so it has no .keys() / .values().
word_counts_top100 = word_counts.most_common(100)
print(word_counts_top100)  # printed for inspection

myWordCloud = WordCloud("评论词云",width=1000, height=620)
words = [word for word, _ in word_counts_top100]
value = [count for _, count in word_counts_top100]
myWordCloud.add("",words,value,shape="circle",word_size_range=[20,100])
myWordCloud.render()

# Number of comments per calendar day, ordered by date string.
day_counts = sorted(collections.Counter(comment_time_day).items())
day_labels = [day for day, _ in day_counts]
day_totals = [total for _, total in day_counts]

# Line chart of the daily comment counts, written to line.html.
line = Line("评论日期")
line.add(
    "",
    day_labels,
    day_totals,
    is_label_show=True,
    mark_point=["average", "max"],
)
line.render('line.html')

# Number of comments per hour of day ("00".."23" strings), ordered by hour.
hour_counts = sorted(collections.Counter(comment_time_hour).items())
hour_labels = [hour for hour, _ in hour_counts]
hour_totals = [total for _, total in hour_counts]

# The same series is drawn twice: once as bars, once as a line.
bar = Bar("评论时间")
bar.add("bar", hour_labels, hour_totals, is_label_show=True)
line = Line()
line.add("line", hour_labels, hour_totals)

# Stack both charts on one canvas and write the result to bar.html.
# Overlap is provided by pyecharts and must be imported alongside Bar/Line.
overlap = Overlap()
overlap.add(bar)
overlap.add(line)
overlap.render('bar.html')

【词云】听故事的故事人

【词云】听故事的故事人

Recommend

396复习

树莓派更新失败

如何了解一个新行业

Securing the Open-source Software Supply Chain

机器视觉：All about Sequence Matching

A Year Without a Byte

Project RADAR: Intelligent Early Fraud Detection System with Humans in the Loop

Real-Time Exactly-Once Ad Event Processing with Apache Flink, Kafka, and Pinot

LEETCODE PATTERNS

Pinot Real-Time Ingestion with Cloud Segment Storage

About Joyk