使用 Pool 多进程时,任务并没有真正并行执行,而是一个接一个地串行运行。
import os
import time
import random
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool
import numpy as np
import pandas as pd
from collections import defaultdict
import itertools
# Read the entity-to-GO-term relationships.
def read_virus_go(file_path):
    """Load a tab-separated indicator table into a dict of column-label lists.

    The file is parsed with ``pandas.read_csv`` using a tab separator; the
    first column becomes the row index and the header row supplies the column
    labels (GO terms, judging by how the result is used by the callers).

    Args:
        file_path: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        A dict mapping every row label to the list of column labels whose
        cell equals 1 in that row (an empty list when no cell equals 1).
        If a row label occurs more than once, the last occurrence wins.
    """
    df = pd.read_csv(file_path, sep='\t', index_col=0)
    # Compare the whole table in one step rather than materialising a Series
    # per row with iterrows(), which is slow on wide tables.
    is_one = df.eq(1).to_numpy()
    columns = df.columns
    return {
        label: columns[row_mask].tolist()
        for label, row_mask in zip(df.index, is_one)
    }
# Read the pairwise similarities between GO terms.
def read_go_similarity(file_path):
    """Parse a whitespace-separated GO-term similarity file.

    Each non-blank line must contain exactly four whitespace-separated
    fields: two term identifiers, a third field that is ignored, and the
    similarity value.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A ``defaultdict(np.float32)`` keyed by ``(term_a, term_b)`` tuples
        ordered so that ``term_a <= term_b`` (string comparison), with the
        similarity stored as ``np.float32``. When a pair appears more than
        once the last line wins.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields or
            its last field cannot be converted to a float.
    """
    go_similarity = defaultdict(np.float32)
    with open(file_path, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) carry no data and
                # would otherwise break the 4-way unpacking below.
                continue
            go1, go2, _, sim = fields
            # Store every pair under a canonical (smaller, larger) key so a
            # lookup never needs to try both orders.
            if go1 > go2:
                go1, go2 = go2, go1
            go_similarity[(go1, go2)] = np.float32(sim)
    return go_similarity
# Compute Sim(go, G): the similarity of one term to a set of terms.
def calc_sim_go_to_g(go, g_terms, go_similarity):
    """Return the best similarity between ``go`` and any term in ``g_terms``.

    Args:
        go: The term to compare.
        g_terms: Collection of terms to compare against.
        go_similarity: Mapping from ``(smaller, larger)`` term tuples to a
            similarity value; pairs that are absent count as 0.

    Returns:
        1 if ``go`` is itself contained in ``g_terms``; otherwise the maximum
        stored similarity between ``go`` and a member of ``g_terms``, or 0
        when ``g_terms`` is empty or no pair is present in the mapping.
    """
    # A term is maximally similar to a set that already contains it.
    if go in g_terms:
        return 1

    def pair_similarity(other):
        # Keys are stored in (smaller, larger) order.
        key = (go, other) if go <= other else (other, go)
        # .get() never inserts, so a defaultdict is not grown by misses.
        return go_similarity.get(key, 0)

    # default=0 keeps an empty g_terms from raising ValueError.
    return max(map(pair_similarity, g_terms), default=0)
# Compute Sim(G1, G2): the similarity between two sets of terms.
def calc_sim_g1_g2(g1_terms, g2_terms, go_similarity):
    """Return the averaged best-match similarity between two term collections.

    Every term of one collection is scored against the other collection with
    ``calc_sim_go_to_g``; the scores from both directions are added up and
    divided by the combined number of terms.

    Args:
        g1_terms: First collection of terms.
        g2_terms: Second collection of terms.
        go_similarity: Pairwise similarity mapping forwarded to
            ``calc_sim_go_to_g``.

    Returns:
        0 if either collection is empty, otherwise the averaged score.
    """
    size_1 = len(g1_terms)
    size_2 = len(g2_terms)
    if 0 in (size_1, size_2):
        return 0

    # Direction 1: each term of G1 against the whole of G2.
    forward_total = 0
    for term in g1_terms:
        forward_total += calc_sim_go_to_g(term, g2_terms, go_similarity)

    # Direction 2: each term of G2 against the whole of G1.
    backward_total = 0
    for term in g2_terms:
        backward_total += calc_sim_go_to_g(term, g1_terms, go_similarity)

    return (forward_total + backward_total) / (size_1 + size_2)
# Task function: compute the similarity for one pair of identifiers.
def compute_similarity_for_pair(pair, virus_to_go, disease_to_go, go_similarity):
    """Score a single ``(v1, v2)`` pair.

    Args:
        pair: Two-item sequence; the first item is looked up in
            ``virus_to_go`` and the second in ``disease_to_go``.
        virus_to_go: Mapping from the first identifier to its term list.
        disease_to_go: Mapping from the second identifier to its term list.
        go_similarity: Pairwise term-similarity mapping.

    Returns:
        A ``(v1, v2, similarity)`` tuple.

    Raises:
        KeyError: If either identifier is missing from its mapping.
    """
    first_id, second_id = pair
    first_terms = virus_to_go[first_id]
    second_terms = disease_to_go[second_id]
    score = calc_sim_g1_g2(first_terms, second_terms, go_similarity)
    return first_id, second_id, score
# Lookup tables shared by all tasks inside one worker process; filled once
# per worker by _init_worker so they are not re-pickled for every task.
_WORKER_TABLES = {}


def _init_worker(virus_to_go, disease_to_go, go_similarity):
    """Pool initializer: store the lookup tables in this worker process."""
    _WORKER_TABLES["virus_to_go"] = virus_to_go
    _WORKER_TABLES["disease_to_go"] = disease_to_go
    _WORKER_TABLES["go_similarity"] = go_similarity


def _similarity_in_worker(pair):
    """Score one pair using the tables installed by _init_worker."""
    return compute_similarity_for_pair(
        pair,
        _WORKER_TABLES["virus_to_go"],
        _WORKER_TABLES["disease_to_go"],
        _WORKER_TABLES["go_similarity"],
    )


# Main entry point.
def main(virus_go_file, disease_go_file, go_similarity_file, output_file,
         *, max_pairs=10, num_workers=5):
    """Compute similarities for virus/disease pairs in parallel and save them.

    Args:
        virus_go_file: Indicator table read with ``read_virus_go``.
        disease_go_file: Indicator table read with ``read_virus_go``.
        go_similarity_file: Pairwise term similarities read with
            ``read_go_similarity``.
        output_file: Destination of the tab-separated result table.
        max_pairs: Number of pairs (taken from the start of the cartesian
            product) to process; ``None`` processes every pair. The default
            of 10 matches the limit that used to be hard-coded.
        num_workers: Number of worker processes; ``None`` lets ``Pool``
            choose.
    """
    # Load the input data.
    virus_to_go = read_virus_go(virus_go_file)
    disease_to_go = read_virus_go(disease_go_file)
    go_similarity = read_go_similarity(go_similarity_file)

    # Build the (virus, disease) pairs lazily so that a small max_pairs does
    # not materialise the whole cartesian product first.
    all_pairs = itertools.product(virus_to_go, disease_to_go)
    pairs = list(itertools.islice(all_pairs, max_pairs))
    # Shuffle so expensive pairs are spread across the chunks Pool.map makes.
    random.shuffle(pairs)

    # Create the output directory up front: failing here is far cheaper than
    # failing after all similarities have been computed.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Passing the three tables inside every task tuple forces the parent to
    # serialise them once per task, which dominates the run time and makes
    # the workers look as if they ran one after another. Hand them over once
    # per worker through the initializer and send only the pair per task.
    with Pool(processes=num_workers,
              initializer=_init_worker,
              initargs=(virus_to_go, disease_to_go, go_similarity)) as pool:
        results = pool.map(_similarity_in_worker, pairs)

    # Save the results. The column names are part of the output format and
    # are kept as they were, although the second column holds the identifier
    # taken from the disease table.
    df = pd.DataFrame(results, columns=['Virus1', 'Virus2', 'Similarity'])
    df.to_csv(output_file, index=False, sep='\t')
    print(f"结果已保存到 {output_file}")
if __name__ == '__main__':
    # Input and output locations (adjust to your own file layout).
    virus_go_file = "../allGO/allGO_Virus"
    disease_go_file = "../allGO/allGO_Disease"
    go_similarity_file = "./id_combinations/out/allGoTermSmi.txt"
    output_file = "./result/virusAndDisease_similarity_results.txt"

    # Run the whole pipeline: load inputs, score pairs, write the table.
    main(
        virus_go_file,
        disease_go_file,
        go_similarity_file,
        output_file,
    )