mirror of https://github.com/exaloop/codon.git
274 lines
5.8 KiB
ReStructuredText
274 lines
5.8 KiB
ReStructuredText
Cookbook
|
|
========
|
|
|
|
.. contents::
|
|
|
|
Subsequence extraction
|
|
----------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
myseq = s'CAATAGAGACTAAGCATTAT'
|
|
sublen = 5
|
|
stride = 2
|
|
|
|
# explicit for-loop
|
|
for subseq in myseq.split(sublen, stride):
|
|
print(subseq)
|
|
|
|
# pipelined
|
|
myseq |> split(sublen, stride) |> print
|
|
|
|
k-mer extraction
|
|
----------------
|
|
|
|
.. code-block:: seq
|
|
|
|
myseq = s'CAATAGAGACTAAGCATTAT'
|
|
stride = 2
|
|
|
|
# explicit for-loop
|
|
for subseq in myseq.kmers(stride, k=5):
|
|
print(subseq)
|
|
|
|
# pipelined
|
|
myseq |> kmers(stride, k=5) |> print
|
|
|
|
Reverse complementation
|
|
-----------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
# sequences
|
|
s = s'GGATC'
|
|
print(~s) # GATCC
|
|
|
|
# k-mers
|
|
k = k'GGATC'
|
|
print(~k) # GATCC
|
|
|
|
k-mer Hamming distance
|
|
----------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
k1 = k'ACGTC'
|
|
k2 = k'ACTTA'
|
|
# ^ ^
|
|
print(abs(k1 - k2)) # Hamming distance = 2
|
|
|
|
k-mer Hamming neighbors
|
|
-----------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
def neighbors(kmer):
|
|
for i in range(len(kmer)):
|
|
for b in (k'A', k'C', k'G', k'T'):
|
|
if kmer[i] != b:
|
|
yield kmer |> base(i, b)
|
|
|
|
print(list(neighbors(k'AGC'))) # CGC, GGC, etc.
|
|
|
|
k-mer minimizer
|
|
---------------
|
|
|
|
.. code-block:: seq
|
|
|
|
def minimizer(s, k: Static[int]):
|
|
assert len(s) >= k
|
|
kmer_min = Kmer[k](s[:k])
|
|
for kmer in s[1:].kmers(k=k, step=1):
|
|
kmer = min(kmer, ~kmer)
|
|
if kmer < kmer_min: kmer_min = kmer
|
|
return kmer_min
|
|
|
|
print(minimizer(s'ACGTACGTACGT', 10))
|
|
|
|
de Bruijn edge
|
|
--------------
|
|
|
|
.. code-block:: seq
|
|
|
|
def de_bruijn_edge(a, b):
|
|
a = a |> base(0, k'A') # reset first base: [T]GAG -> [A]GAG
|
|
b = b >> s'A' # shift right to A: [GAG]C -> A[GAG]
|
|
return a == b # suffix of a == prefix of b
|
|
|
|
print(de_bruijn_edge(k'TGAG', k'GAGC')) # True
|
|
print(de_bruijn_edge(k'TCAG', k'GAGC')) # False
|
|
|
|
Count bases
|
|
-----------
|
|
|
|
.. code-block:: seq
|
|
|
|
@tuple
|
|
class BaseCount:
|
|
A: int
|
|
C: int
|
|
G: int
|
|
T: int
|
|
|
|
def __add__(self, other: BaseCount):
|
|
a1, c1, g1, t1 = self
|
|
a2, c2, g2, t2 = other
|
|
return (a1 + a2, c1 + c2, g1 + g2, t1 + t2)
|
|
|
|
def count_bases(s):
|
|
match s:
|
|
case 'A*': return count_bases(s[1:]) + (1,0,0,0)
|
|
case 'C*': return count_bases(s[1:]) + (0,1,0,0)
|
|
case 'G*': return count_bases(s[1:]) + (0,0,1,0)
|
|
case 'T*': return count_bases(s[1:]) + (0,0,0,1)
|
|
case _: return BaseCount(0,0,0,0)
|
|
|
|
print(count_bases(s'ACCGGGTTTT')) # (A: 1, C: 2, G: 3, T: 4)
|
|
|
|
Spaced seed search
|
|
------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
def has_spaced_acgt(s):
|
|
match s:
|
|
case 'A_C_G_T*':
|
|
return True
|
|
case t if len(t) >= 8:
|
|
return has_spaced_acgt(s[1:])
|
|
case _:
|
|
return False
|
|
|
|
print(has_spaced_acgt(s'AAATCTGTTAAA')) # True
|
|
print(has_spaced_acgt(s'ACGTACGTACGT')) # False
|
|
|
|
Reverse-complement palindrome
|
|
-----------------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
def is_own_revcomp(s):
|
|
match s:
|
|
case 'A*T' | 'T*A' | 'C*G' | 'G*C':
|
|
return is_own_revcomp(s[1:-1])
|
|
case '':
|
|
return True
|
|
case _:
|
|
return False
|
|
|
|
print(is_own_revcomp(s'ACGT')) # True
|
|
print(is_own_revcomp(s'ATTA')) # False
|
|
|
|
Sequence alignment
|
|
------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
# default parameters
|
|
s1 = s'CGCGAGTCTT'
|
|
s2 = s'CGCAGAGTT'
|
|
aln = s1 @ s2
|
|
print(aln.cigar, aln.score) # 3M1I6M -3
|
|
|
|
# custom parameters
|
|
# match = 2; mismatch = 4; gap1(k) = 2k + 4; gap2(k) = k + 13
|
|
aln = s1.align(s2, a=2, b=4, gapo=4, gape=2, gapo2=13, gape2=1)
|
|
print(aln.cigar, aln.score) # 3M1D3M2I2M 2
|
|
|
|
Reading FASTA/FASTQ
|
|
-------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
# iterate over everything
|
|
for r in FASTA('genome.fa'):
|
|
print(r.name)
|
|
print(r.seq)
|
|
|
|
# iterate over sequences
|
|
for s in FASTA('genome.fa') |> seqs:
|
|
print(s)
|
|
|
|
# iterate over everything
|
|
for r in FASTQ('reads.fq'):
|
|
print(r.name)
|
|
print(r.read)
|
|
print(r.qual)
|
|
|
|
# iterate over sequences
|
|
for s in FASTQ('reads.fq') |> seqs:
|
|
print(s)
|
|
|
|
Reading paired-end FASTQ
|
|
------------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
for r1, r2 in zip(FASTQ('reads_1.fq'), FASTQ('reads_2.fq')):
|
|
print(r1.name, r2.name)
|
|
print(r1.read, r2.read)
|
|
print(r1.qual, r2.qual)
|
|
|
|
Parallel FASTQ processing
|
|
-------------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
def process(s: seq):
|
|
...
|
|
|
|
# OMP_NUM_THREADS environment variable controls threads
|
|
FASTQ('reads.fq') |> iter ||> process
|
|
|
|
# Sometimes batching reads into blocks can improve performance,
|
|
# especially if each is quick to process.
|
|
FASTQ('reads.fq') |> blocks(size=1000) ||> iter |> process
|
|
|
|
Reading SAM/BAM/CRAM
|
|
--------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
# iterate over everything
|
|
for r in SAM('alignments.sam'):
|
|
print(r.name)
|
|
print(r.read)
|
|
print(r.pos)
|
|
print(r.mapq)
|
|
print(r.cigar)
|
|
print(r.reversed)
|
|
# etc.
|
|
|
|
for r in BAM('alignments.bam'):
|
|
# ...
|
|
|
|
for r in CRAM('alignments.cram'):
|
|
# ...
|
|
|
|
# iterate over sequences
|
|
for s in SAM('alignments.sam') |> seqs:
|
|
print(s)
|
|
|
|
for s in BAM('alignments.bam') |> seqs:
|
|
print(s)
|
|
|
|
for s in CRAM('alignments.cram') |> seqs:
|
|
print(s)
|
|
|
|
DNA to protein translation
|
|
--------------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
dna = s'AGGTCTAACGGC'
|
|
protein = dna |> translate
|
|
print(protein) # RSNG
|
|
|
|
Reading protein sequences from FASTA
|
|
------------------------------------
|
|
|
|
.. code-block:: seq
|
|
|
|
for s in pFASTA('seqs.fasta') |> seqs:
|
|
print(s)
|