3 Star 8 Fork 1

冰封飞飞 / 计算机英语词频统计

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
wordCount.py 3.00 KB
一键复制 编辑 原始数据 按行查看 历史
'''
词频分析主模块
'''
import asyncio
import collections
import time
from datastore import datastore
from model.models import modelHtmlText, modelWordCount
import statistics
from log import log
import settings
# Module-level logger for the word-count worker. It is built from the project's
# `log` class, writes to settings.WORDCOUNT_LOG_FILE, and uses level log.INFO.
# NOTE: the name `logging` here is this project object, not the stdlib module.
logging = log(filename=settings.WORDCOUNT_LOG_FILE, level=log.INFO)
class wordCount:
    '''
    Word-frequency worker.

    Repeatedly takes a record from the htmltext table (via the datastore),
    counts the words in its text and hands the (word, frequency) pairs to
    the datastore, then updates the per-site statistics.

    Class attributes:
        SLEEP_TIME      seconds to wait when no record is available.
        CYCLE_GAP_TIME  seconds the async loop yields after each record.
        IGNORE_WORD     single lowercase letters that are dropped from the
                        word list (every letter except 'a' and 'i').
    '''
    SLEEP_TIME = 2
    CYCLE_GAP_TIME = 0.1
    IGNORE_WORD = [chr(x) for x in range(ord('a'), ord('z') + 1) if chr(x) not in 'ai']

    def __init__(self):
        self.datastore = datastore()
        # Maps a record's `site` value to the statistics callback for it.
        self.STAT_FUNC = {'wikipedia' : self.__wikiStat}
        self.logging = logging

    def __wikiStat(self, wordList):
        '''
        Update the wikipedia statistics counters for one processed record.

        wordList: the filtered list of words of the record; only its length
        is used.
        '''
        # NOTE(review): `statistics` is presumably a project-local module
        # (these functions are not part of the stdlib module) - confirm.
        statistics.addwikiDoneLinkCount(1)
        statistics.decwikiPreAnalyzeHtmlCount(1)
        statistics.addwikiWordCount(len(wordList))

    @staticmethod
    def _splitWords(text):
        '''
        Split text on whitespace and drop the entries listed in IGNORE_WORD.
        '''
        # Build the lookup set once per record instead of scanning the list
        # for every word; membership results are identical.
        ignored = frozenset(wordCount.IGNORE_WORD)
        return [word for word in text.split() if word not in ignored]

    def _processRecord(self, textObj):
        '''
        Count the words of one htmltext record and store the result.

        Shared by count() and syncCount() so both loops stay in step.
        Raises KeyError when the record's site has no entry in STAT_FUNC
        (after the word counts were already passed to the datastore).
        '''
        site = textObj.site
        textList = self._splitWords(textObj.text)
        # List of (word, frequency) tuples, most frequent first.
        wordTuples = collections.Counter(textList).most_common()
        self.datastore.updateWordCount(wordTuples)
        self.STAT_FUNC[site](textList)

    async def count(self):
        '''
        Count word frequencies for each htmltext record and write them to
        the wordcount table - asynchronous entry point.

        Loops forever: sleeps SLEEP_TIME when no record is available and
        yields for CYCLE_GAP_TIME after each processed record. Any exception
        raised inside the loop is logged and ends the coroutine.
        '''
        try:
            while True:
                textObj = self.datastore.top(modelHtmlText)
                if not textObj:
                    await asyncio.sleep(wordCount.SLEEP_TIME)
                    continue
                self._processRecord(textObj)
                await asyncio.sleep(wordCount.CYCLE_GAP_TIME)
        except Exception as e:
            self.logging.error("wordCount count exception={}".format(e))

    def syncCount(self):
        '''
        Count word frequencies for each htmltext record and write them to
        the wordcount table - synchronous (blocking) entry point.

        Loops forever: logs a warning and sleeps SLEEP_TIME when no record
        is available. Any exception raised inside the loop is logged and
        ends the method.
        '''
        try:
            while True:
                textObj = self.datastore.top(modelHtmlText)
                if not textObj:
                    self.logging.warning('no text')
                    time.sleep(wordCount.SLEEP_TIME)
                    continue
                self._processRecord(textObj)
        except Exception as e:
            self.logging.error("wordCount sync count exception={}".format(e))
def main():
    '''
    Script entry point: build a wordCount worker and drive its async
    count() loop with asyncio.run until it returns.

    A KeyboardInterrupt (Ctrl-C) is swallowed so the script exits quietly.
    '''
    try:
        worker = wordCount()
        asyncio.run(worker.count())
    except KeyboardInterrupt:
        return


# Run the worker only when this file is executed as a script.
if __name__ == '__main__':
    main()
Python
1
https://gitee.com/bingfengfeifei/wordCount.git
git@gitee.com:bingfengfeifei/wordCount.git
bingfengfeifei
wordCount
计算机英语词频统计
master

搜索帮助