Overlap Graphs
http://rosalind.info/problems/grph/
基于图论的基本设定,二代测序的组装就是基于kmer的延伸
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
|
import sys
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--input','-i',
type=str,
required=True,
help='input file in fasta format')
args = parser.parse_args()
def read_fasta(input):
with open(input) as read_fasta:
fasta = {}
for line in read_fasta:
line = line.strip()
if line[0] == '>':
header = line[1:]
else:
sequence = line
fasta[header] = fasta.get(header, '') + sequence
return fasta
#取得序列的suffix
def suffix(str):
sub = []
for i in range(len(str)-3,len(str)):
sub.append(str[i])
return sub
#取得序列的prefix
def prefix(str):
sub = []
for i in range(3):
sub.append(str[i])
return sub
fasta = read_fasta(args.input)
#print(len(fasta.keys()))
for i in fasta.keys():
for k in fasta.keys():
if suffix(fasta[i]) == prefix(fasta[k]) and i != k:
print("{} {}".format(i,k))
|
他山之石
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
#!/usr/bin/env python
from collections import defaultdict
from rosalind import readfasta
def overlap(alist):
prefixes = defaultdict(list)
for name, code in alist:
prefixes[code[:3]].append(name)
for name, code in alist:
suffix = code[-3:]
for other in prefixes[suffix]:
if other != name:
print name, other
overlap(readfasta())
|
Finding a Protein Motif
http://rosalind.info/problems/mprt/
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
|
import requests
with open("rosalind_mprt.txt") as f:
ID_list = f.readlines()
output = {}
for pepID in ID_list:
pepID = pepID.strip()
pep_fasta = requests.get("https://www.uniprot.org/uniprot/" + pepID + ".fasta").text
#print(test)
lines = pep_fasta.splitlines()
#fasta = {}
body = ''
for line in lines:
if line[0] == '>':
header = line[1:]
else:
body += line
print(body)
local = []
for i in range(len(body)-4):
kmer = body[i:i+4]
if kmer[0] == 'N' \
and kmer[1] != 'P' \
and kmer[3] != 'P' \
and (kmer[2] == "S" or kmer[2] == "T"):
local.append(str(i+1))
if local != []:
output[pepID] = local
# for i in output.keys():
# print(i + '\n' + ' '.join(output[i]))
with open("rosalind_mprt.out",'w') as o:
for i in output.keys():
o.write(i + '\n' + ' '.join(output[i]) + '\n')
|
他山之石
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
|
from re import finditer
from sys import argv
from urllib.request import urlopen
uniprot = "http://www.uniprot.org/uniprot/%s.fasta"
for protein in open(argv[1], 'r').read().strip().splitlines():
# Fetch the protein's fasta file and get rid of newlines.'
f = urlopen(uniprot % protein).read().decode('utf-8')
f = ''.join(f.splitlines()[1:])
# Find all the positions of the N-glycosylation motif.
locs = [g.start()+1 for g in finditer(r'(?=N[^P][ST][^P])', f)]
# Print them out, if any.
if locs != []:
print(protein)
print(' '.join(map(str, locs)))
|
Consensus and Profile
http://rosalind.info/problems/cons/
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
|
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--input','-i',
type=str,
required=True,
help="input file in fasta format with equal length strings")
parser.add_argument('--output','-o',
type=str,
default="rosalind_cons.out",
help="output file")
args = parser.parse_args()
def read_fasta(input):
fasta = {}
with open(input) as f:
lines = f.readlines()
for line in lines:
line = line.strip()
if line[0] == '>':
seqID = line[1:]
else:
fasta[seqID] = fasta.get(seqID, '') + line
return fasta
def get_line(N):
for i in fasta.keys():
seq_length = len(fasta[i])
list_a = [0 for x in range(seq_length)]
for i in fasta.keys():
for k in range(seq_length):
if fasta[i][k] == str(N):
list_a[k] += 1
return list_a
fasta = read_fasta(args.input)
bases = ['A','C','G','T']
seq_length = len(get_line('A'))
consensus = ['']*seq_length
for m in range(seq_length):
count_max = 0
for i in bases:
if get_line(i)[m] > count_max:
count_max = get_line(i)[m]
consensus[m] = i
with open(args.output, 'w') as o:
o.write(''.join(consensus) + '\n')
for i in bases:
list_N = get_line(i)
with open(args.output, 'a') as o:
o.write("{}: {}\n".format(i, ' '.join([str(i) for i in list_N])))
|
他山之石
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
|
strands = [x.strip() for x in f.readlines()]
matrix = zip(*strands)
profile_matrix = map(lambda x: dict((base, x.count(base)) for base in "ACGT"), matrix)
consensus = [max(x,key = x.get) for x in profile_matrix]
print "".join(consensus);
for base in "ACGT":
print base+":",
for x in profile_matrix:
print x[base],
print ""
|
总结
需要学习的函数:
1
2
3
4
5
6
|
#正则库中的finditer()
#from sys import argv 可简便接受输入项
#for循环接受列表
map()
zip()
f.write('output.file','a') #追加输出
|
参考来源