Reading a folder of novel files with automatic encoding detection and conversion to a single format: problems encountered
import os
import jieba.posseg as pseg
import chardet
import codecs

# Folder paths
source_folder = "/storage/emulated/0/文件/文本处理/"
output_folder = "/storage/emulated/0/文件/分词处理/"
merged_file = "/storage/emulated/0/文件/合并后文本.txt"

# Create the folder that will hold the segmented output
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Merge the text files into one
def merge_text():
    merged_text = ""
    for filename in os.listdir(source_folder):
        if filename.endswith('.txt'):
            try:
                # Detect the file's encoding, then read it for conversion to UTF-8
                with open(os.path.join(source_folder, filename), 'rb') as f:
                    raw_data = f.read()
                encoding = chardet.detect(raw_data)['encoding']
                with codecs.open(os.path.join(source_folder, filename), 'r', encoding=encoding) as f:
                    text = f.read()
                merged_text += text + '。'  # Append a full stop so each file ends on a sentence boundary
            except Exception as e:
                print(f"Error reading file {filename}: {e}")
    # Write out the merged text, one sentence per line
    with open(merged_file, 'w', encoding='utf-8') as f:
        sentences = merged_text.split('。')
        for sentence in sentences:
            sentence = sentence.strip()
            if sentence:
                f.write(sentence + '。\n')
    print("Text merged and saved.")

# Segment the text and classify words by part of speech
def segment_and_classify():
    # Read the merged text
    with open(merged_file, 'r', encoding='utf-8') as f:
        text = f.read()
    # Segment and tag parts of speech
    words_with_pos = pseg.cut(text)
    # Dictionary of words grouped by POS tag
    pos_dict = {}
    # Classify each word by its POS tag; each tag gets its own file
    for word, pos in words_with_pos:
        # Strip whitespace and newlines
        word = word.strip()
        if word:
            # Create the folder for this POS tag
            pos_folder = os.path.join(output_folder, pos)
            if not os.path.exists(pos_folder):
                os.makedirs(pos_folder)
            # First occurrence of this tag: start a list for it
            if pos not in pos_dict:
                pos_dict[pos] = []
            # Deduplicate
            if word not in pos_dict[pos]:
                pos_dict[pos].append(word)
    # Write each tag's words to its file
    for pos, words in pos_dict.items():
        with open(os.path.join(output_folder, pos, f"{pos}.txt"), 'a', encoding='utf-8') as f:
            for word in words:
                f.write(word + '\n')
    print("Segmentation and POS classification complete.")

# Run: merge the texts (converting to one encoding), then segment and classify by POS
merge_text()
segment_and_classify()
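For reference, pseg.cut yields items that unpack to (word, flag), where flag is the POS tag string (n for nouns, v for verbs, r for pronouns, and so on); those tag strings are what become the folder names above. A quick check of the output, using the example sentence from jieba's own documentation:

    import jieba.posseg as pseg

    # Each item unpacks to (word, flag); the flag strings name the POS folders above
    for word, flag in pseg.cut("我爱北京天安门"):
        print(word, flag)   # expected: 我 r / 爱 v / 北京 ns / 天安门 ns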
I spent the whole morning asking a chatbot about this, and the code it gave kept failing:
/data/user/0/org.qpython.qpy/files/bin/qpython3.sh "/storage/emulated/0/qpython/小说词性划分.py" && exit
Error reading file 异界潜规则.txt: 'gb2312' codec can't decode byte 0x84 in position 22924: illegal multibyte sequence
Text merged and saved.
Building prefix dict from the default dictionary ...
Loading model from cache /storage/emulated/0/qpython/cache/jieba.cache
Loading model cost 2.227 seconds.
Prefix dict has been built successfully.
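The 'gb2312' failure above is a known chardet pitfall: chardet frequently labels GBK/GB18030 text as GB2312, and GB18030 is a superset of both, so widening the guess usually fixes the decode. A minimal sketch of a more forgiving reader along those lines (the helper name and fallback list are illustrative, not from the original script):

    import chardet

    def read_text_robust(path):
        """Decode a text file: try the detected encoding first, then fallbacks."""
        with open(path, 'rb') as f:
            raw = f.read()
        guess = chardet.detect(raw)['encoding']
        # GB18030 is a superset of GB2312/GBK, so widen the guess
        if guess and guess.lower() in ('gb2312', 'gbk'):
            guess = 'gb18030'
        for enc in [guess, 'utf-8', 'gb18030']:
            if not enc:
                continue
            try:
                return raw.decode(enc)
            except (UnicodeDecodeError, LookupError):
                continue
        # Last resort: keep going with replacement characters instead of crashing
        return raw.decode('utf-8', errors='replace')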

On the encoding side: if the source files can come in thousands of different formats, does that mean this code simply cannot do the POS classification?
赵灵越 · 2024-04-18 18:41
import os
import jieba.posseg as pseg
import chardet
import codecs

# Folder paths
source_folder = "/storage/emulated/0/文件/文本处理/"
output_folder = "/storage/emulated/0/文件/分词处理/"
merged_file = "/storage/emulated/0/文件/合并后文本.txt"

# Create the folder that will hold the segmented output
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

def convert_to_utf8(input_file, output_file):
    try:
        # Detect the original file's encoding
        with open(input_file, 'rb') as f:
            raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']
        print(f"Encoding of {input_file}: {encoding}")
        # If the encoding is GB2312, decode as GB2312 and ignore undecodable bytes
        if encoding == 'GB2312':
            text = raw_data.decode('GB2312', errors='ignore')
            encoding = 'GB2312 (processed)'
        else:
            # Otherwise open with the detected encoding and convert to UTF-8
            with codecs.open(input_file, 'r', encoding=encoding) as f:
                text = f.read()
        # Write the content out as a new UTF-8 file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"Converted {input_file} to UTF-8.")
    except Exception as e:
        print(f"Error converting {input_file} to UTF-8: {e}")

def merge_text():
    merged_text = ""
    for filename in os.listdir(source_folder):
        if filename.endswith('.txt'):
            input_file = os.path.join(source_folder, filename)
            output_file = os.path.join(output_folder, f"{filename[:-4]}_utf8.txt")
            # Convert the file to UTF-8 first
            convert_to_utf8(input_file, output_file)
            # Read the converted UTF-8 file
            with open(output_file, 'r', encoding='utf-8') as f:
                text = f.read()
            merged_text += text + '。'  # Append a full stop so each file ends on a sentence boundary
    # Write out the merged text, one sentence per line
    with open(merged_file, 'w', encoding='utf-8') as f:
        sentences = merged_text.split('。')
        for sentence in sentences:
            sentence = sentence.strip()
            if sentence:
                f.write(sentence + '。\n')
    print("Text merged and saved.")

def segment_and_classify():
    # Read the merged text
    with open(merged_file, 'r', encoding='utf-8') as f:
        text = f.read()
    # Segment and tag parts of speech
    words_with_pos = pseg.cut(text)
    # Dictionary of words grouped by POS tag
    pos_dict = {}
    # Classify each word by its POS tag; each tag gets its own file
    for word, pos in words_with_pos:
        # Strip whitespace and newlines
        word = word.strip()
        if word:
            # Create the folder for this POS tag
            pos_folder = os.path.join(output_folder, pos)
            if not os.path.exists(pos_folder):
                os.makedirs(pos_folder)
            # First occurrence of this tag: start a list for it
            if pos not in pos_dict:
                pos_dict[pos] = []
            # Deduplicate
            if word not in pos_dict[pos]:
                pos_dict[pos].append(word)
    # Write each tag's words to its file
    for pos, words in pos_dict.items():
        with open(os.path.join(output_folder, pos, f"{pos}.txt"), 'a', encoding='utf-8') as f:
            for word in words:
                f.write(word + '\n')
    print("Segmentation and POS classification complete.")

# Merge the texts (converting to a single encoding), then segment and classify by POS
merge_text()
segment_and_classify()
This answer was accepted by the asker as the best answer.
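One trade-off in the accepted answer worth noting: errors='ignore' silently drops every byte that GB2312 cannot decode, so characters can vanish from the merged text. Since GB18030 is backward-compatible with both GB2312 and GBK, decoding with it instead usually recovers those characters. A sketch of the swap (not part of the accepted answer):

    # Inside convert_to_utf8, instead of the errors='ignore' branch:
    if encoding and encoding.upper() in ('GB2312', 'GBK'):
        # GB18030 is a superset of both, so this keeps bytes that GB2312 would drop
        text = raw_data.decode('gb18030', errors='replace')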