#!/usr/bin/env python
import heapq
from operator import *
import utils
import config
def iterate_mv(network, annotated_genes, go_num):
    """Predict terms for unannotated genes by iterated majority vote.

    Every node of ``network`` that is not a key of ``annotated_genes`` is
    given the ``go_num`` terms that occur most often among the terms of its
    neighbours.  Annotated neighbours vote with their known terms; other
    neighbours vote with their current prediction, if they have one.  Passes
    over the network are repeated, updating predictions in place, until every
    unannotated gene has a prediction or a full pass resolves no new gene.

    Args:
        network: graph-like object exposing ``nodes()`` and
            ``neighbors(gene)`` (presumably a NetworkX graph -- confirm
            against the caller).
        annotated_genes: mapping of gene -> iterable of known terms.
        go_num: maximum number of terms kept for each predicted gene.

    Returns:
        dict mapping each gene that received a prediction to its list of
        terms, most frequent first.  Genes that can never be reached from a
        term-bearing neighbour are absent from the result.
    """
    # A set gives O(1) removal; the original list made each removal O(n).
    pending = set(
        gene for gene in network.nodes() if gene not in annotated_genes
    )
    predicted_genes = {}

    while pending:
        resolved_any = False
        for gene in network.nodes():
            if gene in annotated_genes:
                continue

            # Tally how many neighbours carry each term.
            neighbor_terms = {}
            for neighbour in network.neighbors(gene):
                if neighbour in annotated_genes:
                    terms = annotated_genes[neighbour]
                elif neighbour in predicted_genes:
                    terms = predicted_genes[neighbour]
                else:
                    continue
                for term in terms:
                    neighbor_terms[term] = neighbor_terms.get(term, 0) + 1

            if not neighbor_terms:
                continue

            # Select top go_num terms as predicted GO terms for the gene.
            top_terms = heapq.nlargest(
                go_num, neighbor_terms.items(), key=itemgetter(1)
            )
            predicted_genes[gene] = [term for term, _count in top_terms]
            if gene in pending:
                pending.discard(gene)
                resolved_any = True

        # Which genes can be predicted depends only on which genes already
        # hold terms, so a pass that resolves nothing can never be followed
        # by one that does.  Stopping here avoids the endless loop the
        # unguarded ``while`` ran into for isolated or unreachable genes.
        if not resolved_any:
            break

    return predicted_genes
def iterate_weighted_mv(network, annotated_genes, go_num, max_iterations=20,
                        sim_path="pfalciparum_data/modified_wang_sim.csv"):
    """Predict terms for unannotated genes by similarity-weighted voting.

    For every unannotated gene, each candidate term (see
    ``get_candidate_terms``) is scored by summing, over the neighbours that
    carry terms, ``weight * max_sim`` where

    * ``max_sim`` is the best similarity between the candidate term and any
      term of the neighbour, and
    * ``weight`` is the gene-to-neighbour similarity computed from the gene's
      current prediction, or from the candidate term alone when the gene has
      no prediction yet (in which case ``weight == max_sim``).

    The ``go_num`` best-scoring candidates become the gene's prediction.
    Passes are repeated until the total network similarity stops changing or
    ``max_iterations`` passes have run.

    Args:
        network: graph-like object exposing ``nodes()`` and
            ``neighbors(gene)``.
        annotated_genes: mapping of gene -> iterable of known terms.
        go_num: maximum number of terms kept for each predicted gene.
        max_iterations: upper bound on the number of passes (default 20,
            the value that used to be hard-coded).
        sim_path: file handed to ``utils.read_sim``; the default is the path
            that used to be hard-coded.

    Returns:
        dict mapping each gene that received a prediction to its list of
        terms, best score first.
    """
    # Assumes read_sim returns a nested mapping indexed as
    # sim_cache[term_a][term_b] -- that is how it is used below.
    sim_cache = utils.read_sim(sim_path)
    predicted_genes = {}
    last_sum = -1.0

    for _round in range(max_iterations):
        for gene in network.nodes():
            if gene in annotated_genes:
                continue
            candidate_terms = get_candidate_terms(
                network, gene, annotated_genes, predicted_genes
            )
            if not candidate_terms:
                continue

            # The gene's own prediction is only replaced after all candidates
            # are scored, so it is constant for the scoring loop.  ``None``
            # (not an empty list) marks "no prediction yet".
            current_terms = predicted_genes.get(gene)

            cterm_sim_sum = {}
            for cterm in candidate_terms:
                sim_sum = 0.0
                # For each neighbour of gene
                for neighbour in network.neighbors(gene):
                    if neighbour in annotated_genes:
                        neighbour_terms = annotated_genes[neighbour]
                    elif neighbour in predicted_genes:
                        neighbour_terms = predicted_genes[neighbour]
                    else:
                        continue
                    max_sim = compute_gene_sim(
                        [cterm], neighbour_terms, sim_cache
                    )
                    if current_terms is None:
                        weight = max_sim
                    else:
                        weight = compute_gene_sim(
                            current_terms, neighbour_terms, sim_cache
                        )
                    sim_sum += 1.0 * weight * max_sim
                cterm_sim_sum[cterm] = sim_sum

            # Select top go_num terms as predicted GO terms for the gene.
            top_terms = heapq.nlargest(
                go_num, cterm_sim_sum.items(), key=itemgetter(1)
            )
            predicted_genes[gene] = [term for term, _score in top_terms]

        total_sum = compute_total_sim(
            network, annotated_genes, predicted_genes, sim_cache
        )
        # NOTE(review): convergence is judged on the integer part of the
        # total only, so changes smaller than 1 count as "no change".
        # Kept as-is; confirm this coarse tolerance is intended.
        if int(total_sum) == int(last_sum):
            break
        last_sum = total_sum

    return predicted_genes
def get_candidate_terms(network, gene, annotated_genes, predicted_genes):
    """Collect the distinct terms carried by the neighbours of ``gene``.

    A neighbour contributes its entry from ``annotated_genes`` when it has
    one, otherwise its entry from ``predicted_genes``; neighbours found in
    neither mapping contribute nothing.

    Args:
        network: graph-like object exposing ``neighbors(gene)``.
        gene: node whose neighbourhood is inspected.
        annotated_genes: mapping of gene -> iterable of known terms.
        predicted_genes: mapping of gene -> iterable of predicted terms.

    Returns:
        list of terms without duplicates, in first-seen order.
    """
    candidate_terms = []
    seen = set()
    for neighbour in network.neighbors(gene):
        if neighbour in annotated_genes:
            terms = annotated_genes[neighbour]
        elif neighbour in predicted_genes:
            terms = predicted_genes[neighbour]
        else:
            continue
        for term in terms:
            if term not in seen:
                seen.add(term)
                candidate_terms.append(term)
    return candidate_terms
def compute_gene_sim(gene1_terms, gene2_terms, sim_cache):
    """Compute the similarity between two genes.

    Each gene is represented by a list of terms; the gene similarity is the
    largest term-to-term similarity over all pairs.

    Args:
        gene1_terms: terms of the first gene (outer keys of ``sim_cache``).
        gene2_terms: terms of the second gene (inner keys of ``sim_cache``).
        sim_cache: nested mapping indexed as ``sim_cache[term1][term2]``.

    Returns:
        The largest looked-up similarity, or ``-1.0`` when either term list
        is empty or no similarity exceeds ``-1.0``.
    """
    # Use max sim between two terms as the sim between two genes.
    max_sim = -1.0
    for term1 in gene1_terms:
        row = sim_cache[term1]
        for term2 in gene2_terms:
            sim = row[term2]
            if sim > max_sim:
                max_sim = sim
    return max_sim
def compute_total_sim(network, annotated_genes, predicted_genes, sim_cache):
    """Sum the gene-to-gene similarity over every (gene, neighbour) pair.

    A gene's terms come from ``annotated_genes`` when it has an entry there,
    otherwise from ``predicted_genes``.  Genes or neighbours found in neither
    mapping are skipped.  Previously they either raised ``NameError`` or
    silently reused the terms/similarity left over from the preceding loop
    iteration.

    Each pair is visited once per direction in which ``network.neighbors``
    reports it.

    Args:
        network: graph-like object exposing ``nodes()`` and
            ``neighbors(gene)``.
        annotated_genes: mapping of gene -> iterable of known terms.
        predicted_genes: mapping of gene -> iterable of predicted terms.
        sim_cache: nested term-similarity mapping passed through to
            ``compute_gene_sim``.

    Returns:
        float total of the pairwise similarities (``0.0`` when no pair has
        terms on both ends).
    """
    total = 0.0
    for gene in network.nodes():
        if gene in annotated_genes:
            gene_terms = annotated_genes[gene]
        elif gene in predicted_genes:
            gene_terms = predicted_genes[gene]
        else:
            # No terms for this gene: nothing to compare.
            continue
        for neighbor in network.neighbors(gene):
            if neighbor in annotated_genes:
                neighbor_terms = annotated_genes[neighbor]
            elif neighbor in predicted_genes:
                neighbor_terms = predicted_genes[neighbor]
            else:
                continue
            total += compute_gene_sim(gene_terms, neighbor_terms, sim_cache)
    return total
def main():
    """Script entry point; currently a placeholder that does nothing."""
    pass


if __name__ == "__main__":
    main()