Skip to content

Instantly share code, notes, and snippets.

@shujishigenobu
Last active August 29, 2015 14:05
Show Gist options
  • Select an option

  • Save shujishigenobu/cd6e2974533c23973d7c to your computer and use it in GitHub Desktop.

Select an option

Save shujishigenobu/cd6e2974533c23973d7c to your computer and use it in GitHub Desktop.
require 'bio'
include Bio
#require 'pp'
#alias :p :pp
ref_proteome_file = ARGV[0]
# || "/home/DB/public/processed/OrthoDB/OrthoDB7/blastdb/DMELA.Drosophila_melanogaster.fas")
blast_res_file = ARGV[1]
# || "BLAST_DMELA/RsGM3.evm.out.combined.pep.vs.DMELA.Drosophila_melanogaster.fas.blastp.fmt7c.txt")
# query_file = ARGV[2] # query fasta
if ARGV[0] == "-h" || ARGV.size != 2
puts "Usage:
ruby gmodels_hit_coverage.rb reference_fasta blast_result
reference_fasta (fasta format)
blast_result (format 7)
"
exit
end
## print header of report
puts "#=== HIT COVERAGE STATISTICS ==="
puts "# "
puts "# date: #{Time.now}"
puts "# script: #{$0}"
puts "# user: #{ENV['USER']}@#{ENV['HOSTNAME']}"
puts "#"
puts "# Reference proteome: #{ref_proteome_file}"
puts "# BLAST result: #{blast_res_file}"
puts "#"
puts "#= Database side statistics "
## inspect reference proteome fasta
refp_total = 0 # aa
refp_ids = []
FlatFile.open(FastaFormat, ref_proteome_file).each do |fas|
refp_ids << fas.entry_id
refp_total += fas.seq.length
end
puts "num of reference proteins: #{refp_ids.size}"
puts "total length (aa): #{refp_total}"
## Parse blast (format 7 or 6)
## get top hit only
blast_res_top = {}
q_curr =nil
s_curr = nil
i = 0
queries_with_hits = []
File.open(blast_res_file).each do |l|
# puts l
next if /^\#/.match(l)
a = l.chomp.split(/\t/)
query = a[0]
subject = a[1]
unless blast_res_top.has_key?(query)
blast_res_top[query] = []
q_curr = query
s_curr = subject
queries_with_hits << query
end
if query == q_curr && subject == s_curr
blast_res_top[query] << a
end
i += 1
# break if i >100
end
## output in bed format
tmpf = "tmp.bed"
io = File.open(tmpf, "w")
blast_res_top.each do |q, hits|
hits.each do |h|
ary = [h[1],
h[8].to_i,
h[9].to_i,
h[0],
h[2].to_f,
"+",
]
io.puts ary.join("\t")
end
end
io.close
cmd = "sort -k1,1 -k2,2n tmp.bed > tmp.sorted.bed"
system cmd
outf = "merged.bed"
cmd = "mergeBed -i tmp.sorted.bed > #{outf}"
system cmd
data = {}
File.open(outf).each do |l|
a = l.chomp.split(/\t/)
key = a[0]
unless data.has_key?(key)
data[key] = {}
data[key]['hsps'] = []
end
data[key]['hsps'] << [a[1].to_i, a[2].to_i]
end
data.keys.sort.each do |k|
#p data[k]['hsps']
cov_len = data[k]['hsps'].map{|hsp| hsp[1] - hsp[0] + 1}.inject(0){|sum, i| sum + i}
data[k]['cov_len'] = cov_len
end
total_hit_cov = 0
data.keys.sort.each do |k|
total_hit_cov += data[k]['cov_len']
# puts [k,
# data[k]['cov_len']
# ].join("\t")
end
num_hit_genes = data.keys.size
puts sprintf("num of hits: %d (%.1f %%)", num_hit_genes, num_hit_genes / refp_ids.size.to_f * 100)
puts sprintf("hit coverage (aa): %d (%.1f %%)", total_hit_cov, total_hit_cov / refp_total.to_f * 100)
## Parse blast results again
## to inspect query side information
## blast output should be formated in format7
queries = []
File.open(blast_res_file).each do |l|
if m = /^\# Query:/.match(l)
qname = m.post_match.strip.split[0]
queries << qname
end
end
puts "#"
puts "#= Query side statistics "
puts sprintf("num of queries: %d", queries.size)
puts sprintf("num of queries with hits: %d (%.1f %%)", queries_with_hits.size, queries_with_hits.size / queries.size.to_f * 100)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment