分类是机器学习和数据挖掘中最基础的一种工作。假设现在我们有一组训练样本,以及与其相对应的分类标签。每个元组都被表示为 n 维属性向量 $x=(x_1, x_2, \dots, x_n)$ 的形式,一共有 k 个类别 $c_1, c_2, \dots, c_k$。分类要做的就是模型可以预测数据属于哪个类别。对于每个类别 $c_i$,利用贝叶斯公式来估计在给定训练元组 $x$ 时的条件概率 $P(c_i \mid x)$:
$$P(c_i \mid x) = \frac{P(c_i)\,P(x \mid c_i)}{P(x)}$$
当且仅当概率 P(ci|x) 在所有类别中取值最大时,数据 x 属于 ci 。 P(ci) 是类先验概率,P(x|ci) 是样本 x 相对于类 ci 的类条件概率,称为似然。因为 P(x) 是用于归一化的证据因子,其对于所有的类别都是恒定的。所以只需要基于训练数据来估计 P(ci) 和 P(x|ci)
def getStopWords(self, path):
    """Load stop words from *path* (one word per line) into self.stopWords.

    Parameters
    ----------
    path : str
        Path to a text file containing one stop word per line.
    """
    # `with` guarantees the handle is closed (the original leaked it).
    with open(path) as f:
        for line in f:
            # rstrip("\n") matches the original `line[:len(line)-1]` for
            # newline-terminated lines, but — unlike the slice — it does not
            # chop the last character of a final line with no trailing newline.
            self.stopWords.append(line.rstrip("\n"))
def getWordsList(self, content, wordsList):
    """Segment *content* with jieba and append new, meaningful tokens to *wordsList*.

    A token is appended only if it is not a stop word, not blank, and not
    already present in *wordsList* — so the list holds unique words, in
    first-seen order.

    Parameters
    ----------
    content : str
        Raw text to segment.
    wordsList : list[str]
        Accumulator mutated in place.
    """
    # Iterate the jieba generator directly; no need to materialize a list.
    for w in jieba.cut(content):
        # Guard against blank/None tokens first, then filter stop words
        # and duplicates.  `is not None` replaces the original `!= None`.
        if w is None or not w.strip():
            continue
        if w not in self.stopWords and w not in wordsList:
            wordsList.append(w)
def getWords(self, path):
    """Count, for every word, how many files in directory *path* contain it.

    Because getWordsList de-duplicates words within a single file, the
    resulting counts are per-file (document-frequency) counts, not raw
    occurrence counts.

    Parameters
    ----------
    path : str
        Directory containing the training mail files.

    Returns
    -------
    tuple[dict[str, int], int]
        (word -> number of files containing it, number of files scanned)
    """
    wordsDic = {}
    fileList = os.listdir(path)
    # Keep only CJK characters; compile the pattern once instead of per line.
    cjk_only = re.compile(r"[^\u4e00-\u9fa5]")
    for fileName in fileList:
        wordsList = []
        # os.path.join replaces the fragile `path + fileName` concatenation,
        # and `with` closes each handle (the original leaked them).
        with open(os.path.join(path, fileName)) as f:
            for line in f:
                self.getWordsList(cjk_only.sub("", line), wordsList)
        # wordsList is unique per file, so each word adds at most 1 per file.
        for item in wordsList:
            wordsDic[item] = wordsDic.get(item, 0) + 1
    return wordsDic, len(fileList)
def getTestWords(self, wordsDic):
    """Estimate P(spam | word) for every word in *wordsDic*.

    Each word's likelihood in the spam/ham classes is its document
    frequency (self.spamDict / self.normDict) divided by the class file
    count.  A word seen in only one class gets a small floor of 0.0001 for
    the missing class; a word seen in neither falls back to the global
    spam/ham ratio (2020-06 figures: 0.4816 spam / 0.5184 ham).

    Parameters
    ----------
    wordsDic : dict[str, int]
        Words from the mail under test (counts are unused).

    Returns
    -------
    dict[str, float]
        word -> P(spam | word), assuming equal class priors.
    """
    wordsProbDic = {}
    # Only the keys matter here; the original iterated .items() and
    # discarded the count.
    for word in wordsDic:
        # .get avoids the original's double membership-then-index lookups.
        spam_count = self.spamDict.get(word)
        norm_count = self.normDict.get(word)
        if spam_count is None and norm_count is None:
            # Unseen in both training sets: fall back to global priors.
            pw_s, pw_n = 0.4816, 0.5184
        else:
            # 0.0001 is the floor for a class the word never appeared in.
            pw_s = 0.0001 if spam_count is None else spam_count / self.spamFileNum
            pw_n = 0.0001 if norm_count is None else norm_count / self.normFileNum
        # Bayes with equal priors over this single word: P(s|w).
        wordsProbDic[word] = pw_s / (pw_s + pw_n)
    return wordsProbDic
def calBayes(self, wordsProbDic):
    """Combine per-word spam probabilities into a single spam score.

    Uses the naive-Bayes combination of independent evidence:
    p = prod(p_i) / (prod(p_i) + prod(1 - p_i)), where each p_i is
    P(spam | word_i).  An empty dict yields 0.5 (no evidence either way).

    Parameters
    ----------
    wordsProbDic : dict[str, float]
        word -> P(spam | word), as produced by getTestWords.

    Returns
    -------
    float
        Combined probability that the mail is spam.
    """
    spam_product = 1.0
    ham_product = 1.0
    # Only the probabilities matter; the words themselves are not needed.
    for prob in wordsProbDic.values():
        spam_product *= prob
        ham_product *= 1.0 - prob
    return spam_product / (spam_product + ham_product)
def judgeSpam(self, filename, threshold=0.9):
    """Classify the mail in *filename*; return True if it looks like spam.

    The file's CJK text is segmented into unique words, each word's spam
    probability is estimated with getTestWords, and the combined
    naive-Bayes score from calBayes is compared against *threshold*.

    Parameters
    ----------
    filename : str
        Path of the mail file to classify.
    threshold : float, optional
        Decision cutoff on the combined spam score.  Defaults to 0.9,
        the original hard-coded value, so existing callers are unchanged.

    Returns
    -------
    bool
        True when the combined spam score exceeds *threshold*.
    """
    wordsList = []
    # Compile once instead of per line; keep only CJK characters.
    cjk_only = re.compile(r"[^\u4e00-\u9fa5]")
    # `with` closes the handle (the original leaked it).
    with open(filename) as f:
        for line in f:
            self.getWordsList(cjk_only.sub("", line), wordsList)
    # getWordsList de-duplicates, so every count below is 1; the dict is
    # built only because getTestWords expects a word -> count mapping.
    wordsDic = {}
    for item in wordsList:
        wordsDic[item] = wordsDic.get(item, 0) + 1
    wordsProbDic = self.getTestWords(wordsDic)
    return self.calBayes(wordsProbDic) > threshold