生物信息学原理作业第四弹:DNA序列组装(贪婪算法)
原理:生物信息学(孙啸)
大致思想:
1. 找到权值最大的边;
2. 除去以最大权值边的起始顶点为起始顶点的边;
3. 除去以最大权值边为终点为终点的边;
4. 重复上述步骤,得到所有符合条件的边;
5. 拼接得到的边;
6. 加入孤立点(如果有)。
附上Python代码,如果有问题我会及时更正(确实不太熟算法)
转载请保留出处!
1 # -*- coding: utf-8 -*-
2 """
3 Created on Mon Dec 4 15:04:39 2017 4 @author: zxzhu 5 python3.6 6 """
7 from functools import reduce 8 def get_weight(s1,s2): #通过两条序列的overlap计算出权值
9 l = min(len(s1),len(s2)) 10 while l>0: 11 if s2[:l] == s1[-l:]: 12 return l 13 else: 14 l-=1
15 return 0 16
17 def print_result(s1,s2): #将两条序列去除首尾overlap后合并
18 weight = get_weight(s1,s2) 19 s = s1 + s2[weight:] 20 #print(s)
21 return s 22
23 def dir_graph(l,t=3): #得到满足条件的有向图(权值大于等于t)
24 graph = {} 25 for i in l: 26 for j in l: 27 if i!=j: 28 weight = get_weight(i,j) 29 if weight >= t: 30 key = (i,j) 31 graph[key] = weight 32 return graph 33
34 def rm_path(graph,path): #贪婪算法加入一条边后应该去除与该边首尾顶点相同的边
35 key = graph.keys() 36 rm_key = [] 37 for i in key: 38 if i[1] == path[1] or i[0] == path[0]: 39 rm_key.append(i) 40 for i in rm_key: 41 graph.pop(i) 42 return graph 43
44 def get_path(graph,path = []): #得到满足条件的所有边
45 while graph: 46 max_weight = 0 47 for i in graph.keys(): 48 if graph[i] > max_weight: 49 max_weight = graph[i] 50 cur_path = i 51 path.append(cur_path) 52 graph = rm_path(graph,cur_path) 53 get_path(graph,path) 54 return path 55
56 def out_num(path,V): #计算某顶点的出度
57 count = 0 58 for i in path: 59 if i[0] == V: 60 count+=1
61 return count 62
63 def get_last_V(path,last_V = None): #得到最后一条边
64 index = 0 65 if last_V: #非随机寻找出度为0的顶点
66 for i in path: 67 if i[1] == last_V: 68 return i,index 69 else: 70 index+=1
71 return None #没有找到指向last_V的顶点(一条路径结束)
72 else: #随机寻找出度为0的顶点
73 for i in path: 74 if out_num(path,i[1]) == 0: 75 return i,index 76 else: 77 index+=1
78 return -1 #首尾相连
79
80
81 def assemble(cur_V,path,new_path = []): #给满足条件的边排序
82 while path: 83 path.pop(cur_V[1]) 84 new_path.insert(0,cur_V[0]) 85 cur_V = get_last_V(path,last_V = cur_V[0][0]) 86 if cur_V: 87 assemble(cur_V,path,new_path) 88 else: 89 cur_V = get_last_V(path) 90 assemble(cur_V,path,new_path) 91 return new_path 92
93 def align_isolated(path,sequence): #加入孤立顶点
94 new_path = reduce(lambda x,y:x+y,path) 95 for i in sequence: 96 if i not in new_path: 97 new_path.append(i) 98 return new_path 99
100 x = 'CCTTTTGG'
101 y = 'TTGGCAATCACT'
102 w = 'AGTATTGGCAATC'
103 u = 'ATGCAAACCT'
104 z = 'AATCGATG'
105 v = 'TCACTCCTTTT'
106 a = w 107 b = y 108 c = 'TCACTAGTA'
109 sequence = [x,y,w,u,z] 110 sequence1 = [a,b,c] 111 graph = dir_graph(sequence1,t=3) 112 print(graph) 113 path = get_path(graph) 114 path = [list(i) for i in path] #将path中的tuple元素换成list
115 #print(path)
116 start = get_last_V(path) #起始出度为0的顶点所在的边
117 if start == -1: #序列首尾相连
118 new_path = reduce(lambda x,y:x+y, path) 119 new_path = new_path[:-1] 120 result = reduce(print_result,new_path) 121 else: 122 new_path = assemble(start,path) #排序后的边
123 new_path = align_isolated(new_path,sequence1) #加入孤立顶点
124 #print(new_path)
125 result = reduce(print_result,new_path) #组装
126 #print(new_path)
127 print(result)