News Hot Words: The Top 20 High-Frequency Words of 2017

1 minute read

Dataset

Hot news query data for 2017.

Loading the Data

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties  
dataPath = "../data/hotquery.news.csv"
COLUMNS = ["query", "source", "category", "date"]
dataset = pd.read_csv(tf.gfile.Open(dataPath), sep="\t", encoding="utf8", header=None, names=COLUMNS)
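
The file here is a local, tab-separated text file, so the TensorFlow file wrapper is optional; a minimal sketch reading it with pandas alone (assuming the same path and column layout) looks like this:

import pandas as pd
COLUMNS = ["query", "source", "category", "date"]
# pd.read_csv accepts a plain path; tf.gfile only matters for remote
# filesystems such as HDFS or GCS.
dataset = pd.read_csv("../data/hotquery.news.csv", sep="\t", encoding="utf8",
                      header=None, names=COLUMNS)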

Data Analysis

dataset.head(5)
            query  source  category   date
0   找小姐拒付嫖资   baidu   society   2017-02-25 14:45:42.0
1   印度女星遭轮奸   baidu   ent       2017-02-25 14:45:42.0
2  高校招生30禁令    baidu   society   2017-02-25 14:45:42.0
3   王菲谢霆锋热吻   baidu   ent       2017-02-25 14:45:43.0
4  南非爆发排外游行  baidu   society   2017-02-25 14:45:43.0
dataset.shape
(16817, 4)
dataset.groupby('source').size()
source
360         567
baidu      8517
shenma     1838
sogou      3379
unknown    2243
weixin      273
dtype: int64
dataset.groupby('category').size()
category
auto         37
china       279
ent        1907
finance     121
mil          61
society    5887
sports      101
tech         65
unknown    8181
world       178
dtype: int64
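
The same breakdown can also be read off with value_counts, which plots directly through the pandas plotting API; a quick sketch on the already-loaded dataset:

# equivalent to groupby('category').size(), but sorted by count
print(dataset["category"].value_counts())
dataset["category"].value_counts().plot(kind="bar", color="green", alpha=0.8)
plt.show()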

Segmenting the Hot Words

import jieba
class Segment:
    def cut(self, words):
        # precise mode; cut_all=True would return all possible segments
        seg_list = jieba.cut(words, cut_all=False)
        return seg_list
segment = Segment()
def clean(string):
    # lower-case the query, segment it, and drop whitespace-only tokens
    lowered = string.lower()
    tokens = segment.cut(lowered)
    meaningful_words = [w for w in tokens if w.strip()]
    return meaningful_words
print ",".join(clean(u"jieba分词"))
jieba,分词
dataset["keywords"] = dataset["query"].map(clean)
Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/jw/_2n4n7v92sxg2sckwl6k8kt00000gp/T/jieba.cache
Loading model cost 0.408 seconds.
Prefix dict has been built succesfully.
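
If important multi-character names get split apart by the default dictionary, jieba can be told to keep them as single tokens before clean is mapped over the dataset; the entries below are purely illustrative:

import jieba
# hypothetical examples: register entities that should stay as one token
jieba.add_word(u"谢霆锋")
jieba.add_word(u"排外游行")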

Counting the Top 20 Words

# count word frequencies, skipping single-character tokens
word_freq = {}
for words in dataset["keywords"]:
    for word in words:
        if len(word) <= 1:
            continue
        if word in word_freq:
            word_freq[word] += 1
        else:
            word_freq[word] = 1
# sort words by frequency in descending order
sort_word = []
for word, freq in word_freq.items():
    sort_word.append((word, freq))
sorted_word = sorted(sort_word, key=lambda x: x[1], reverse=True)
# take the top 20 words for plotting
x = []
y = []
for word in sorted_word[:20]:
    x.append(word[0])
    y.append(word[1])
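
The counting and sorting above can be written more compactly with collections.Counter; a sketch assuming the same dataset["keywords"] column:

from collections import Counter
# count every token longer than one character across all queries
counter = Counter(w for words in dataset["keywords"] for w in words if len(w) > 1)
top20 = counter.most_common(20)
x = [word for word, _ in top20]
y = [count for _, count in top20]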

Visualization

fontset = FontProperties(fname='/System/Library/Fonts/PingFang.ttc')  # macOS font; fixes garbled Chinese labels in pyplot
x_pos = np.arange(len(x))
plt.figure(figsize=(15,7))
plt.xticks(x_pos,x, fontproperties=fontset)
plt.bar(x_pos,y,align='center',color='green',alpha=0.8)  
plt.title(u"2017年top20新闻单词", fontproperties=fontset)
plt.show()  
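
To keep the chart as an image file as well (for embedding in a post), the figure can be written to disk; plt.savefig should be called before plt.show(), since some backends clear the figure after it is shown. The file name is only an example:

plt.savefig("top20_news_words_2017.png", dpi=150, bbox_inches="tight")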

[Figure: bar chart of the top 20 news words of 2017]
