def getStopWords(self, path):
    """Load stop words from a text file into ``self.stopWords``.

    Every line of the file is appended, in order, to the existing
    ``self.stopWords`` list with its line terminator removed.

    Args:
        path: Path of the stop-word file. It is opened in text mode with
            the platform default encoding.
    """
    # The context manager closes the file even if reading fails part-way.
    with open(path) as stopFile:
        for line in stopFile:
            # Strip only the newline: blindly dropping the last character
            # would truncate a final line that has no trailing newline.
            self.stopWords.append(line.rstrip("\n"))
def getWordsList(self, content, wordsList):
    """Segment ``content`` and append its new, meaningful words to ``wordsList``.

    A token is appended when it is not a stop word, is not blank, and is
    not already present in ``wordsList``. The list is modified in place and
    keeps first-seen order.

    Args:
        content: Text to segment with ``jieba.cut``.
        wordsList: List of words collected so far; extended in place.
    """
    # Snapshot of the words already collected, so the duplicate test is a
    # hash lookup instead of a scan of the whole list for every token.
    seen = set(wordsList)
    for w in jieba.cut(content):
        # Check for None before calling a string method on the token.
        if w is None or w.strip() == '':
            continue
        if w in self.stopWords or w in seen:
            continue
        seen.add(w)
        wordsList.append(w)
def getWords(self, path):
    """Build a word -> file-count dictionary from every file in a directory.

    Each file is read line by line, every character outside the range
    U+4E00..U+9FA5 is removed, and the remainder is passed to
    ``self.getWordsList`` to collect that file's words. Every collected
    word then adds one to its entry in the result.

    Args:
        path: Directory holding the files to read. A trailing path
            separator is optional.

    Returns:
        A tuple ``(wordsDic, num)``: ``wordsDic`` maps each word to its
        accumulated count and ``num`` is the number of directory entries
        returned by ``os.listdir(path)``.
    """
    # Compiled once instead of once per line.
    nonChinese = re.compile(r"[^\u4e00-\u9fa5]")
    wordsDic = {}
    fileList = os.listdir(path)
    for fileName in fileList:
        # Words of the current file only; getWordsList fills it in place.
        wordsList = []
        # os.path.join works whether or not `path` ends with a separator;
        # the context manager closes each file as soon as it is consumed.
        with open(os.path.join(path, fileName)) as mailFile:
            for line in mailFile:
                self.getWordsList(nonChinese.sub("", line), wordsList)
        for word in wordsList:
            wordsDic[word] = wordsDic.get(word, 0) + 1
    return wordsDic, len(fileList)
def getTestWords(self, wordsDic):
    """Compute P(spam | word) for every word of a message.

    For each word, P(word | spam) and P(word | normal) are taken as the
    word's count in ``self.spamDict`` / ``self.normDict`` divided by
    ``self.spamFileNum`` / ``self.normFileNum``. The result for the word is
    ``P(w|s) / (P(w|s) + P(w|n))``.

    Args:
        wordsDic: Dictionary whose keys are the words to score; the values
            are not used.

    Returns:
        A dictionary mapping each word to its P(spam | word) estimate.
    """
    # Small non-zero likelihood for a word missing from one of the two
    # training dictionaries, so the ratio below never divides by zero.
    unseenProb = 0.0001
    # Word found in neither dictionary: fall back to default likelihoods
    # based on the global spam ratio (figures as of 2020-06).
    defaultSpamProb = 0.4816
    defaultNormProb = 0.5184

    wordsProbDic = {}
    for word in wordsDic:
        inSpam = word in self.spamDict
        inNorm = word in self.normDict
        if not inSpam and not inNorm:
            pw_s = defaultSpamProb
            pw_n = defaultNormProb
        else:
            pw_s = self.spamDict[word] / self.spamFileNum if inSpam else unseenProb
            pw_n = self.normDict[word] / self.normFileNum if inNorm else unseenProb
        # P(s|w)
        wordsProbDic[word] = pw_s / (pw_s + pw_n)
    return wordsProbDic
def calBayes(self, wordsProbDic):
    """Combine per-word spam probabilities into one spam probability.

    Computes ``S / (S + N)`` where ``S`` is the product of all
    probabilities and ``N`` is the product of their complements. An empty
    dictionary yields 0.5.

    Args:
        wordsProbDic: Dictionary mapping each word to P(spam | word).

    Returns:
        The combined probability as a float in [0, 1].

    Raises:
        ZeroDivisionError: If both products are exactly zero, i.e. the
            input holds a probability <= 0 and another one >= 1.
    """
    import math

    negInf = float("-inf")
    # Work with sums of logarithms: the raw products underflow to 0.0 for
    # long messages, which made the original division fail with 0 / 0.
    logSpam = 0.0
    logNorm = 0.0
    for prob in wordsProbDic.values():
        complement = 1 - prob
        logSpam += math.log(prob) if prob > 0 else negInf
        logNorm += math.log(complement) if complement > 0 else negInf

    spamIsZero = logSpam == negInf
    normIsZero = logNorm == negInf
    if spamIsZero and normIsZero:
        raise ZeroDivisionError("both probability products are zero")
    if spamIsZero:
        return 0.0
    if normIsZero:
        return 1.0

    # S / (S + N) == 1 / (1 + N / S). Only ever exponentiate a
    # non-positive number so math.exp cannot overflow.
    diff = logNorm - logSpam
    if diff >= 0:
        ratio = math.exp(-diff)
        return ratio / (1.0 + ratio)
    return 1.0 / (1.0 + math.exp(diff))
def judgeSpam(self, filename, threshold=0.9):
    """Decide whether the message stored in ``filename`` is spam.

    The file is read line by line, every character outside the range
    U+4E00..U+9FA5 is removed, and the remainder is passed to
    ``self.getWordsList`` to collect the message's words. The words are
    scored with ``self.getTestWords`` and combined with ``self.calBayes``.

    Args:
        filename: Path of the message file to classify.
        threshold: The message is reported as spam when the combined
            probability is strictly greater than this value.

    Returns:
        True if the combined spam probability exceeds ``threshold``,
        otherwise False.
    """
    # Compiled once instead of once per line.
    nonChinese = re.compile(r"[^\u4e00-\u9fa5]")
    wordsList = []
    # The context manager closes the file even if segmentation fails.
    with open(filename) as mailFile:
        for line in mailFile:
            self.getWordsList(nonChinese.sub("", line), wordsList)

    wordsDic = {}
    for word in wordsList:
        wordsDic[word] = wordsDic.get(word, 0) + 1

    wordsProbDic = self.getTestWords(wordsDic)
    return self.calBayes(wordsProbDic) > threshold