用Python統計詞頻
def unique_fields(astr):
    """Split one tab-separated line into its distinct fields.

    The trailing line break is removed *before* splitting so that the last
    field compares equal to an identical earlier field (otherwise "a\ta\n"
    would yield two entries for "a").

    Args:
        astr: One line of text, fields separated by tab characters.

    Returns:
        A list of the distinct fields in first-seen order. An empty line
        yields [""], because splitting an empty string gives one empty field.
    """
    fields = astr.rstrip("\r\n").split("\t")
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    return list(dict.fromkeys(fields))


if __name__ == "__main__":
    from collections import Counter

    # Each field is counted at most once per line, so the totals are the
    # number of lines in which a field appears.
    code_doc = Counter()
    with open("test_data.txt", "r", encoding='utf-8') as fs:
        for ln in fs:
            code_doc.update(unique_fields(ln))
    for key, count in code_doc.items():
        print(key + ' ' + str(count))
求看python 統計中文詞頻的代碼,有一個地方不懂 求大神
首先要說(shuō)明一個(gè)概念:gbk編碼里一個(gè)中文字符的‘長(zhǎng)度’是2。
str?=?'*'??#gbk編碼
要取得'中'這個(gè)字符,需要用分片str[0:2],而不是索引str[0]。
以z4為例,下面這些代碼的效果是這樣的。
x = '同舟共濟與時俱進艱苦奮斗'
i+= z4.findall(x) # 返回['同舟共濟','與時俱進', '艱苦奮斗']
i+= z4.findall(x[2:]) # 返回['舟共濟與', '時俱進艱']
i+= z4.findall(x[4:]) # 返回['共濟與時', '俱進艱苦']
i+= z4.findall(x[6:]) # 返回['濟與時俱', '進艱苦奮']
目的是取得所有連續4字中文字符串。
如何用python實現英文短文的雙詞頻統計
簡單版:
#!/usr/bin/env python3
# Simple version: count English words with a regex and everything else
# (Chinese words and symbols) with jieba segmentation, then print both
# frequency tables, most common first.
import re
from collections import Counter

import jieba

fname = 'counttest.txt'
# Read as UTF-8 explicitly: the platform default (e.g. GBK on Windows)
# would otherwise decide how the Chinese text is decoded.
with open(fname, encoding='utf-8') as f:
    s = f.read()

# An English word, optionally hyphenated (e.g. "well-known").
pattern = re.compile(r'[a-zA-Z]+-?[a-zA-Z]*')
english_words = Counter(pattern.findall(s))
# Whatever remains after removing the English words is segmented by jieba.
other_words = Counter(jieba.cut(pattern.sub('', s)))

print('\n英文單詞統計結果:\n' + '-'*17)
print('\n'.join(['{}: {}'.format(i, j) for i, j in english_words.most_common()]))
print('\n中文及符號統計結果:\n' + '-'*19)
print('\n'.join(['{}: {}'.format(i, j) for i, j in other_words.most_common()]))
復雜版:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, division, unicode_literals

import os
import re
import sys
import time
from collections import Counter
from datetime import datetime

import jieba
class WordCounter(object):
    """Count English words and the remaining tokens of a text file.

    The file is read fully into memory, decoded, and split into two
    frequency tables: English words, and everything else (Chinese words
    and symbols). The report is written to ``to_file`` or, when that is
    None, printed to the terminal.

    Args:
        from_file: Path of the file to read.
        to_file: Path of the file the report is written to. None (or one of
            the strings 'None', 'Null', 'none', 'null') prints instead.
        coding: Encoding of ``from_file``. When None it is detected with
            chardet from the first 10000 bytes of the file.
        jieba_cut: Any value other than None enables jieba segmentation of
            the non-English text; None counts single characters instead.

    Raises:
        Exception: If ``from_file`` is not an existing file.

    How to use:
        w = WordCounter('a.txt', 'b.txt')
        w.run()
    """

    # An English word, optionally hyphenated (e.g. "well-known").
    _ENGLISH_WORD = re.compile(r'[a-zA-Z]+-?[a-zA-Z]*')

    def __init__(self, from_file, to_file=None, coding=None, jieba_cut=None):
        if not os.path.isfile(from_file):
            raise Exception('No such file: 文件不存在')
        self.f1 = from_file
        self.filesize = os.path.getsize(from_file)
        self.f2 = to_file
        if coding is None:
            coding = self._detect_coding(from_file)
        self.coding = coding
        # _c[0] holds English words, _c[1] everything else.
        self._c = [Counter(), Counter()]
        self.jieba = jieba_cut is not None

    @staticmethod
    def _detect_coding(path):
        """Guess the encoding of *path* from its first 10000 bytes."""
        try:
            import chardet
        except ImportError:
            # Best-effort install kept from the original script.
            os.system('pip install chardet')
            print('-'*70)
            import chardet
        with open(path, 'rb') as f:
            detected = chardet.detect(f.read(10000))['encoding']
        # chardet reports None when it cannot decide (e.g. an empty file).
        return detected or 'utf-8'

    def run(self):
        """Count the input file, emit the report and record the time taken.

        The elapsed time is stored in ``self.cost`` as a string such as
        '0.3s'.
        """
        start = time.time()
        self.count_direct(self.f1)
        if self.f2 not in ['None', 'Null', 'none', 'null', None]:
            with open(self.f2, 'wb') as f:
                f.write(self.result.encode(self.coding))
        else:
            print('\nEnglish words:\n' + '-'*15)
            print(self.result)
        self.cost = '{:.1f}'.format(time.time()-start) + 's'

    def count_direct(self, from_file):
        """Read the whole file into memory and add its counts to the totals."""
        with open(from_file, 'rb') as f:
            data = f.read()
        # Parse once and feed each resulting table into its own counter.
        for counter, counts in zip(self._c, self.parse(data)):
            counter.update(counts)

    def parse(self, line):
        """Decode raw bytes and count their contents.

        Args:
            line: Bytes read from the input file, encoded as ``self.coding``.

        Returns:
            A pair ``(english, other)`` of Counters: English words, and the
            remaining text segmented by jieba (or counted per character when
            jieba is disabled).
        """
        text = line.decode(self.coding)
        # A word hyphenated across a line break is joined back together.
        text = re.sub(r'-\r?\n', '', text)
        english_words = self._ENGLISH_WORD.findall(text)
        rest = self._ENGLISH_WORD.sub('', text)
        # Only the non-English remainder is counted here; the English words
        # are already covered by the first counter.
        ex = Counter(jieba.cut(rest)) if self.jieba else Counter(rest)
        return Counter(english_words), ex

    def flush(self):
        """Discard all counts collected so far."""
        self._c = [Counter(), Counter()]

    @property
    def counter(self):
        """The two Counters: [English words, everything else]."""
        return self._c

    @property
    def result(self):
        """The report as text, identical to what is written to the file."""
        sections = [
            '\n'.join('{}: {}'.format(word, count)
                      for word, count in c.most_common())
            for c in self._c
        ]
        tip = '\n\n中文及符號統計結果:\n' + '-'*15 + '\n'
        return tip.join(sections)


def humansize(size):
    """Format a size in bytes with a unit, rounding down to whole units.

    Sizes of 1024 T and above stay in T instead of being divided once more
    with no larger unit to show for it.

    >>> humansize(1024) == '1 KB'
    True
    >>> humansize(1000) == '1000 B'
    True
    >>> humansize(1024*1024) == '1 M'
    True
    >>> humansize(1024*1024*1024*2) == '2 G'
    True
    >>> humansize(1024**5) == '1024 T'
    True
    """
    units = ['B', 'KB', 'M', 'G', 'T']
    for unit in units[:-1]:
        if size < 1024:
            return '{} {}'.format(size, unit)
        size = size // 1024
    # Largest unit: report whatever is left without dividing again.
    return '{} {}'.format(size, units[-1])


def main():
    """Command-line entry point.

    Usage: python SCRIPT from_file [to_file] [coding=NAME] [jieba_cut=VALUE]

    ``key=value`` arguments set options; the remaining arguments are the
    input file and the optional output file. jieba segmentation is on by
    default; ``jieba_cut=0`` (or false/no/off/none) turns it off.
    """
    args = {'coding': None, 'jieba_cut': 1}
    option = re.compile(r'^({})=(.+)$'.format('|'.join(args)))
    positional = []
    for arg in sys.argv[1:]:
        matched = option.match(arg)
        if matched:
            args[matched.group(1)] = matched.group(2)
        else:
            positional.append(arg)
    if not positional:
        print('Usage: python {} from_file [to_file] [coding=NAME] '
              '[jieba_cut=VALUE]'.format(sys.argv[0]))
        sys.exit(1)
    # WordCounter enables jieba for any value that is not None.
    if str(args['jieba_cut']).lower() in ('0', 'false', 'no', 'off', 'none'):
        args['jieba_cut'] = None
    from_file = positional[0]
    # Without a second file name the report is printed to the terminal.
    to_file = positional[1] if len(positional) > 1 else None
    w = WordCounter(from_file, to_file, **args)
    w.run()


if __name__ == '__main__':
    import doctest
    doctest.testmod()
    main()
更復雜的:如果是比較大的文件,建議采用多進程,詳情百度:多進程讀取大文件并統計詞頻
求這個python詞頻問題的詳細代碼,題目如圖?
使用Python寫文件的時候,或者將網絡數據流寫入到本地文件的時候,大部分情況下會遇到:UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position ... 這個問題。 網絡上有很多類似的文件講述如何解決這個問題,但是無非就是encode,decode相關的,這是導致該問題出現的真正原因嗎?不是的。 很多時候,我們使用了decode和encode,試遍了各種編碼,utf8,utf-8,gbk,gb2312等等,該有的編碼都試遍了,可是編譯的時候仍然出現: UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position XXX。 崩潰了。
在windows下面編寫python腳本,編碼問題很嚴重。
將網絡數據流寫入文件時,我們會遇到幾個編碼:
1: #encoding='XXX' 這里(也就是python文件第一行的內容)的編碼是指該python腳本文件本身的編碼,無關緊要。只要XXX和文件本身的編碼相同就行了。 比如notepad++ "格式"菜單里面可以設置各種編碼,這時需要保證該菜單里設置的編碼和encoding XXX相同就行了,不同的話會報錯
2:網絡數據流的編碼 比如獲取網頁,那么網絡數據流的編碼就是網頁的編碼。需要使用decode解碼成unicode編碼。
3:目標文件的編碼 要將網絡數據流的編碼寫入到新文件,那么我們需要指定新文件的編碼。寫文件代碼如:
復制代碼 代碼如下:
f.write(txt)
,那么txt是一個字符串,它是通過decode解碼過的字符串。關鍵點就要來了:目標文件的編碼是導致標題所指問題的罪魁禍首。如果我們打開一個文件:
復制代碼 代碼如下:
f = open("out.html","w")
,在windows下面,新文件的默認編碼是gbk,這樣的話,python解釋器會用gbk編碼去解析我們的網絡數據流txt,然而txt此時已經是decode過的unicode編碼,這樣的話就會導致解析不了,出現上述問題。 解決的辦法就是,改變目標文件的編碼:
復制代碼 代碼如下:
f = open("out.html","w",encoding='utf-8')
--------------------------------------------------------(以下為個人遇到的問題)
個人編碼:“encoding='utf-8'”解決encode(編碼)和decode(譯碼)的問題
文章*發布于: 2021-08-18
版權聲明:本文為博主原創文章,遵循 CC 4.0 BY-SA 版權協議,轉載請附上原文出處鏈接和本聲明。
本文鏈接: