# [hosting-page residue, not part of the program] Code pull complete; the page will refresh automatically.
'''
词频分析主模块
'''
import asyncio
import collections
import time
from datastore import datastore
from model.models import modelHtmlText, modelWordCount
import statistics
from log import log
import settings
# Module-level logger built by the project's `log` helper; the target file name
# comes from settings.WORDCOUNT_LOG_FILE and the level passed is log.INFO.
# NOTE(review): this name shadows the standard-library `logging` module inside
# this file -- confirm nothing here expects the stdlib module under that name.
logging = log(filename=settings.WORDCOUNT_LOG_FILE, level=log.INFO)
class wordCount:
    """Word-frequency worker that reads records from the htmltext table.

    Each record fetched from the datastore is split on whitespace, the
    per-word frequencies are handed to ``datastore.updateWordCount`` and a
    per-site statistics hook (see ``STAT_FUNC``) is called with the filtered
    word list.  ``count`` is the asyncio entry point, ``syncCount`` the
    blocking one; both share ``_processRecord``.
    """

    # Seconds to wait before polling the datastore again when it had no record.
    SLEEP_TIME = 2
    # Seconds handed back to the event loop between two records in ``count``.
    CYCLE_GAP_TIME = 0.1
    # Single lowercase letters that are dropped before counting; 'a' and 'i'
    # are left out of this list, so they are still counted as words.
    IGNORE_WORD = [chr(x) for x in range(ord('a'), ord('z') + 1) if chr(x) not in 'ai']

    def __init__(self):
        """Create the datastore handle and register the per-site statistics hooks."""
        self.datastore = datastore()
        # Maps a record's ``site`` value to the function that updates its statistics.
        self.STAT_FUNC = {'wikipedia': self.__wikiStat}
        self.logging = logging

    def __wikiStat(self, wordList):
        """Update the wikipedia statistics for one processed record.

        Calls ``addwikiDoneLinkCount(1)``, ``decwikiPreAnalyzeHtmlCount(1)`` and
        ``addwikiWordCount(len(wordList))`` on the project's ``statistics`` module.

        Args:
            wordList: the filtered words of the record; only its length is used.
        """
        statistics.addwikiDoneLinkCount(1)
        statistics.decwikiPreAnalyzeHtmlCount(1)
        statistics.addwikiWordCount(len(wordList))

    def _processRecord(self, textObj):
        """Count the words of one htmltext record and store the result.

        Args:
            textObj: a record exposing ``site`` and ``text`` attributes.

        The word counts are always written.  When no statistics hook is
        registered for ``textObj.site`` a warning is logged and the statistics
        step is skipped, instead of raising ``KeyError`` and stopping the worker.
        """
        site = textObj.site
        text = textObj.text
        words = [word for word in text.split() if word not in wordCount.IGNORE_WORD]
        # List of (word, frequency) tuples, most frequent first.
        wordTuples = collections.Counter(words).most_common()
        self.datastore.updateWordCount(wordTuples)

        statFunc = self.STAT_FUNC.get(site)
        if statFunc is None:
            self.logging.warning('no stat function for site={}'.format(site))
            return
        statFunc(words)

    async def count(self):
        """Process htmltext records forever; asynchronous entry point.

        Sleeps ``SLEEP_TIME`` seconds when the datastore returns nothing and
        ``CYCLE_GAP_TIME`` seconds after every processed record.  Any
        ``Exception`` raised inside the loop is logged and ends the loop.
        """
        try:
            while True:
                textObj = self.datastore.top(modelHtmlText)
                if not textObj:
                    await asyncio.sleep(wordCount.SLEEP_TIME)
                    continue
                self._processRecord(textObj)
                await asyncio.sleep(wordCount.CYCLE_GAP_TIME)
        except Exception as e:
            self.logging.error("wordCount count exception={}".format(e))

    def syncCount(self):
        """Process htmltext records forever; blocking entry point.

        Logs a warning and sleeps ``SLEEP_TIME`` seconds when the datastore
        returns nothing.  Any ``Exception`` raised inside the loop is logged
        and ends the loop.
        """
        try:
            while True:
                textObj = self.datastore.top(modelHtmlText)
                if not textObj:
                    self.logging.warning('no text')
                    time.sleep(wordCount.SLEEP_TIME)
                    continue
                self._processRecord(textObj)
        except Exception as e:
            self.logging.error("wordCount sync count exception={}".format(e))
def main():
    """Run the asynchronous word-count loop until it ends or is interrupted.

    A KeyboardInterrupt (Ctrl-C) is swallowed so the script exits without a
    traceback.
    """
    try:
        worker = wordCount()
        asyncio.run(worker.count())
    except KeyboardInterrupt:
        # Interrupting the worker is an expected way to stop it.
        return
# Script entry point: start the worker only when this file is executed directly.
if __name__ == "__main__":
    main()
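# [hosting-page residue, not part of the program] This section may contain content unsuitable for display, so the page does not show it. You can review and edit it yourself using the relevant editing functions.
# [hosting-page residue, not part of the program] If you confirm the content involves no inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / false or worthless content, and nothing that violates national laws and regulations, you may click submit to appeal and it will be handled as soon as possible.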