利用贝叶斯进行垃圾邮件过滤代码
这里提供一个简单的用 Python 实现贝叶斯算法的垃圾邮件过滤例子:
import os
import re
import math
from collections import defaultdict
# Read a file and return its content as a text string.
def read_file(path):
    """Return the full text content of the file at ``path``.

    The file is decoded as UTF-8. Bytes that are not valid UTF-8 are replaced
    with U+FFFD instead of raising ``UnicodeDecodeError``, so one badly
    encoded message cannot abort the processing of a whole mail directory.

    Args:
        path: Path of the file to read.

    Returns:
        The decoded file content as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        return f.read()
# Convert a text string into the set of distinct words it contains.
# The pattern is compiled once at import time rather than on every call.
_WORD_PATTERN = re.compile(r'[a-z]+')


def tokenize(text):
    """Return the set of distinct lowercase ASCII words found in ``text``.

    The text is lowercased first; a "word" is any maximal run of the letters
    ``a``-``z``. Digits, punctuation and non-ASCII characters act as
    separators and are discarded.

    Args:
        text: The text to split into words.

    Returns:
        A ``set`` of words (duplicates collapsed, order not preserved).
    """
    return set(_WORD_PATTERN.findall(text.lower()))
# Estimate, for every word seen in the training set, the probability that it
# appears in a spam message and in a non-spam (ham) message.
def train_set(directory):
    """Train the word-probability table from the mail files in ``directory``.

    A file whose name contains ``'spam'`` is treated as a spam message; every
    other regular file is treated as ham. Sub-directories and other non-file
    entries are skipped.

    Because ``tokenize`` returns a set, each message contributes a given word
    at most once, so the counters hold "number of messages containing the
    word". The probabilities are therefore estimated per message, with
    add-one smoothing::

        P(word | class) = (messages_containing_word + 1) / (messages + 2)

    which always lies strictly between 0 and 1.

    Args:
        directory: Path of the directory holding the training messages.

    Returns:
        A ``dict`` mapping each word to a tuple
        ``(P(word | spam), P(word | ham))``.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read.
    """
    # Number of spam / ham messages that contain each word.
    spam_counts = defaultdict(int)
    ham_counts = defaultdict(int)
    # Number of messages seen per class; these are the smoothing denominators.
    spam_docs = 0
    ham_docs = 0

    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue
        if 'spam' in filename:
            spam_docs += 1
            counts = spam_counts
        else:
            ham_docs += 1
            counts = ham_counts
        for word in tokenize(read_file(path)):
            counts[word] += 1

    word_prob = {}
    for word in spam_counts.keys() | ham_counts.keys():
        # .get() avoids inserting missing keys into the defaultdicts.
        word_prob[word] = (
            (spam_counts.get(word, 0) + 1) / (spam_docs + 2),
            (ham_counts.get(word, 0) + 1) / (ham_docs + 2),
        )
    return word_prob
# Compute the probability that a message is spam.
def spam_probability(word_probs, message):
    """Return the probability that ``message`` is spam.

    Every word in ``word_probs`` contributes one factor per class: its
    probability when the word occurs in the message, and one minus that
    probability when it does not. Both classes start from an equal prior
    of 0.5.

    The factors are accumulated as a sum of logarithms. Multiplying them
    directly underflows to 0.0 for both classes once the vocabulary is large,
    which made the final division fail with ``ZeroDivisionError``.

    Args:
        word_probs: Mapping of word to ``(P(word | spam), P(word | ham))``.
        message: The raw message text to classify.

    Returns:
        A float in ``[0.0, 1.0]``. If both classes have zero likelihood
        (only possible when ``word_probs`` contains probabilities of exactly
        0 or 1), 0.5 is returned because the evidence favours neither class.
    """
    neg_inf = float('-inf')

    def _log(p):
        # log(0) is undefined in math.log; treat it as -inf so that a zero
        # factor zeroes the whole product, as it did in the linear domain.
        return math.log(p) if p > 0.0 else neg_inf

    message_words = tokenize(message)
    log_spam = log_ham = math.log(0.5)
    for word, (spam_prob_word, ham_prob_word) in word_probs.items():
        if word in message_words:
            log_spam += _log(spam_prob_word)
            log_ham += _log(ham_prob_word)
        else:
            log_spam += _log(1 - spam_prob_word)
            log_ham += _log(1 - ham_prob_word)

    if log_spam == neg_inf and log_ham == neg_inf:
        return 0.5
    if log_spam == neg_inf:
        return 0.0
    if log_ham == neg_inf:
        return 1.0

    # spam / (spam + ham) == 1 / (1 + exp(log_ham - log_spam)).
    # Only ever exponentiate a non-positive number so exp() cannot overflow.
    diff = log_ham - log_spam
    if diff >= 0:
        ratio = math.exp(-diff)
        return ratio / (1.0 + ratio)
    return 1.0 / (1.0 + math.exp(diff))
# Test function: train on a directory and report how many messages of each
# class are classified correctly.
def test_spam_filter(directory):
    """Train on ``directory`` and print the per-class hit rate.

    The model is trained with ``train_set`` and then evaluated on the same
    directory, so the printed figures measure fit to the training data, not
    performance on unseen mail.

    Files whose name contains ``'spam'`` are expected to score above 0.5;
    files whose name contains ``'ham'`` are expected to score 0.5 or below.
    Sub-directories and other non-file entries are ignored. If a class has
    no files, its rate is reported as 0.0 instead of dividing by zero.

    Args:
        directory: Path of the directory holding the labelled messages.

    Returns:
        None. The two rates are written to standard output.
    """
    word_probs = train_set(directory)

    # List the directory once and keep only regular files.
    filenames = [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    ]
    spam_files = [f for f in filenames if 'spam' in f]
    ham_files = [f for f in filenames if 'ham' in f]

    def _score(filename):
        # Spam probability of one file in the directory.
        path = os.path.join(directory, filename)
        return spam_probability(word_probs, read_file(path))

    spam_count = sum(1 for f in spam_files if _score(f) > 0.5)
    ham_count = sum(1 for f in ham_files if _score(f) <= 0.5)

    spam_precision = spam_count / len(spam_files) if spam_files else 0.0
    ham_precision = ham_count / len(ham_files) if ham_files else 0.0
    print('垃圾邮件准确率:', spam_precision)
    print('非垃圾邮件准确率:', ham_precision)
# Script entry point: train and evaluate on the "email" directory, resolved
# relative to the current working directory.
if __name__ == '__main__':
    test_spam_filter('email')
这个例子使用了一个包含垃圾邮件和非垃圾邮件的训练集来训练模型,该训练集放在名为“email”的文件夹中:文件名中包含“spam”的文件被视为垃圾邮件,其余文件被视为非垃圾邮件。对于每个单词,我们统计包含它的垃圾邮件和非垃圾邮件的数量,并通过加一平滑估计该单词在两类邮件中出现的条件概率。然后,对于每封邮件,我们根据贝叶斯定理,利用这些单词概率计算它是垃圾邮件的概率:概率大于 0.5 时归类为垃圾邮件,否则归类为非垃圾邮件。需要注意的是,示例中的测试直接使用了训练集本身,因此输出的准确率只反映模型对训练数据的拟合程度。
该例子仅作演示用途,实际的垃圾邮件过滤还需要更多的处理和优化,例如使用更复杂的特征提取方法、对文本和特征进行归一化处理、使用独立的测试集评估模型,等等。
该博文为原创文章,未经博主同意不得转载。本文章博客地址:https://cplusplus.blog.csdn.net/article/details/133968692