Script to extract FASTA sequences

posted on 01:04 PM on Monday 17 November 2014

Wanted a very quick way to query a FASTA sequence and extract sub-sequences. Since samtools already does this, I could integrate with it via system calls but for large number of queries, this would be computational expensive. Checking the documentation for the index that samtools create for FASTA file, I realize that I could reuse the index using Python's memory mapped files. So I wrote a script to do this. The idea here is that this functionality can then be placed into Python script removing the need for expensive system calls. The script is as follows, it takes as the first parameter the FASTA filename and the rest of the parameters as regions in the format of chr:start-end or chr:position.

#!/usr/bin/env python

import sys
import mmap
import re

indexes = {}

# read the index file first
infile = open("%s.fai" % sys.argv[1])
for line in infile:
	line = line.strip()
	if line != "": 
		fields = line.split()
		if len(fields) != 5:
			print "Incorrect index line: %s" % line
			sys.exit(1)
		indexes[fields[0]] = {
			"length" : int(fields[1]),
			"offset" : int(fields[2]),
			"base_per_row" : int(fields[3]),
			"chr_per_row" : int(fields[4]), 
		}
infile.close()

def posToOffset(chr, pos):
	pos -= 1 # change to 0 based index
	offset = indexes[chr]["offset"] # use the chromosome offset first
	offset += pos # add the number of based
	offset += (pos / indexes[chr]["base_per_row"]) * (indexes[chr]["chr_per_row"] - indexes[chr]["base_per_row"]) # account for newlines
	#print chr, pos, indexes[chr]["offset"], offset
	return offset

with open(sys.argv[1], "r+b") as f:
	mm = mmap.mmap(f.fileno(), 0)
	for region in sys.argv[2:]:
	# range
	mo = re.search("^(\\w+)\\:(\\d+)\\-(\\d+)$", region)
	if mo != None:
		chr = mo.group(1)
		if chr in indexes:
			start = int(mo.group(2))
			end = int(mo.group(3))
			spos = posToOffset(chr, start)
			epos = posToOffset(chr, end+1)
			seq = mm[spos:epos].replace("\n", "")
			if len(seq) != (end-start+1):
				sys.stderr.write("Incorrect sequence length found for %s, %s\n" % (region, seq))
			else:
				print seq else: sys.stderr.write("Unknown chromosome %s\n" % chr)
	else:
		# single position
		mo = re.search("^(\\w+)\\:(\\d+)$", region)
		if mo != None:
			chr = mo.group(1)
			if chr in indexes:
				spos = posToOffset(chr, int(mo.group(2)))
				print mm[spos].replace("\n", "")
			else:
				sys.stderr.write("Unknown chromosome %s\n" % chr)
		else:
			sys.stderr.write("Unknown region %s\n" % region)

bernett.net