2017-08-19 00:53:25

Python Word Segmentation

Dependencies

jieba (word segmentation)
pandas
numpy
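
All three can usually be installed with pip; the full script in the appendix additionally needs beautifulsoup4, plus matplotlib and wordcloud if the word-cloud step is enabled:

pip install jieba pandas numpy
pip install beautifulsoup4 matplotlib wordcloud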

Process

1 Merge the text and strip the punctuation from it

 
def mergeData(commit_list_all):
    comments = ''
    for c in commit_list_all:
        comments = comments + (str(c)).strip()

    # keep only Chinese characters, which also removes punctuation, digits and whitespace
    cleaned_comments = ''.join(re.findall(u'[\u4e00-\u9fff]+', comments.decode("utf-8")))
    return cleaned_comments
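
A quick sketch of what this cleaning step keeps: only runs of Chinese characters survive, so digits, English text and punctuation all disappear (the sample comment below is made up for illustration):

# Standalone sketch of the cleaning regex used in mergeData, on a made-up comment
import re

sample = u"吴京的动作戏很燃!!!666……值得一看。"
cleaned = u''.join(re.findall(u'[\u4e00-\u9fff]+', sample))
print(cleaned)   # -> 吴京的动作戏很燃值得一看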

2 Remove stop words from the data

def cleanStopWords(cleaned_comments):
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})
    stopwords = pd.read_csv("stop_words_zh_UTF-8.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3: treat quote characters literally

    # drop every segment that appears in the stop-word list
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

3 Group, count, and sort the keywords

    # count each word; the "计数" column holds the frequency
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})

    # sort by frequency, highest first
    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
    return words_stat
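
For reference, here is a minimal standalone sketch of what the segmentation and counting steps produce for a single made-up sentence; the exact split depends on the jieba dictionary version, so the output shown is only approximate:

# Tiny end-to-end sketch of steps 2-3 (no stop-word file needed for this demo)
import jieba
import pandas as pd

segment = jieba.lcut(u"吴京的这部电影很燃电影里的动作戏很好看")
words_df = pd.DataFrame({'segment': segment})
# value_counts() gives the same word/frequency ranking that the groupby above builds
print(words_df.segment.value_counts().head())
# roughly: 的 2, 电影 2, ...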

Save to the database

Below is a sample of the saved rows, apparently (id, keyword, count), ordered by count:

"3","都","47"
"4","吴京","35"
"5","不","32"
"6","电影","32"
"7","中国","28"
"9","动作","27"
"8","人","27"
"10","很","26"
"11","还","22"
"12","看","21"
"13","一个","17"
"14","上","15"
"16","大片","14"
"15","战狼","14"
"17","这部","13"
"18","好莱坞","13"

Appendix: code for fetching the Douban comments, segmenting them, and saving the keywords to the database

# -*- coding: utf-8 -*-
__author__ = 'licha'
import sys
reload(sys)
sys.setdefaultencoding("utf-8")   # Python 2 encoding hack; the whole script targets Python 2
import spider                      # local crawler helper; provides spider.gethtml(url)
from bs4 import BeautifulSoup as bs
import re
import jieba                       # word segmentation package
import pandas as pd
import numpy                       # numerical package

import database.mysql as db        # local MySQL helper; provides db.execute(sql)

# fetch one page of comments
def getPage(pageNum):
    start = pageNum * 20          # each page holds 20 comments; pageNum counts from 0
    url = "https://movie.douban.com/subject/26363254/comments?start={0}&limit=20&sort=new_score&status=P".format(start)
    html = spider.gethtml(url)
    return html
#spider.cookie = "   "

# extract the list of comment texts from one page
def getSubject(html):
    commit_list = []
    soup = bs(html, "html.parser")
    commit_list_p = soup.select(".comment > p")
    for comment in commit_list_p:
        commit_list.append(comment.get_text())
    return commit_list

# collect the comments from the first five pages
def getAllData():
    commit_list_all = []
    for i in range(0, 5):
        html = getPage(i)
        commit_list = getSubject(html)
        commit_list_all.extend(commit_list)
    return commit_list_all

# merge all comments into one string and keep only Chinese characters
def mergeData(commit_list_all):
    comments = ''
    for c in commit_list_all:
        comments = comments + (str(c)).strip()

    cleaned_comments = ''.join(re.findall(u'[\u4e00-\u9fff]+', comments.decode("utf-8")))
    return cleaned_comments
# segment the text, drop stop words, and count word frequencies
def cleanStopWords(cleaned_comments):
    segment = jieba.lcut(cleaned_comments)
    words_df = pd.DataFrame({'segment': segment})
    stopwords = pd.read_csv("stop_words_zh_UTF-8.txt", index_col=False, quoting=3, sep="\t", names=['stopword'], encoding='utf-8')  # quoting=3: treat quote characters literally

    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # count each word; the "计数" column holds the frequency
    words_stat = words_df.groupby(by=['segment'])['segment'].agg({"计数": numpy.size})

    words_stat = words_stat.reset_index().sort_values(by=["计数"], ascending=False)
    return words_stat
# draw a word cloud from the most frequent words (optional; disabled in main below)
def showPic(words_stat):
    import matplotlib.pyplot as plt
    import matplotlib
    matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
    from wordcloud import WordCloud   # word cloud package

    # set the Chinese font, background color and maximum font size
    wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)
    # fit_words expects a word -> frequency mapping; take the 100 most frequent words
    wordcloud = wordcloud.fit_words(dict(words_stat.head(100).itertuples(index=False)))

    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
# write the 100 most frequent keywords into the doubankeyword table
def saveKeyWord(name, words_stat):
    for key, count in words_stat.head(100).itertuples(index=False):
        print count, key
        sql = "insert into doubankeyword (name,keyword,count) values('{0}','{1}',{2})".format(name, key, count)
        db.execute(sql)

def main():
    commit_list_all = getAllData()
    comments = mergeData(commit_list_all)
    words_stat = cleanStopWords(comments)
    saveKeyWord("战狼2", words_stat)
    #showPic(words_stat)
    print comments

if __name__ == "__main__":
    print("你好")

    main()
    #print main()