#! /usr/bin/env python # # test_blast_xml.py # __author__ = "Kristian Rother" __copyright__ = "Copyright 2007-2012, The Cogent Project" __contributors__ = ["Micah Hamady"] __credits__ = ["Rob Knight"] __license__ = "GPL" __version__ = "1.5.3" __maintainer__ = "Kristian Rother" __email__ = "krother@rubor.de" __status__ = "Prototype" from cogent.util.unit_test import main, TestCase from cogent.parse.blast_xml import BlastXMLResult, MinimalBlastParser7,\ get_tag, parse_hsp, parse_hit, parse_header, parse_parameters,\ HSP_XML_FIELDNAMES, HIT_XML_FIELDNAMES import xml.dom.minidom class GetTagTests(TestCase): """Tests for the auxiliary function evaluating the tag objects.""" def setUp(self): self.single_tag = xml.dom.minidom.parseString(\ "<outer>bla <inner>content</inner>bla</outer>") self.double_tag = xml.dom.minidom.parseString(\ "<outer><inner>first content</inner><inner>second content</inner></outer>") self.empty_tag = xml.dom.minidom.parseString("<outer></outer>") def test_get_tag_works(self): self.assertEqual(get_tag(self.single_tag,'inner'),'content') self.assertEqual(get_tag(self.double_tag,'inner'),'first content') self.assertEqual(get_tag(self.empty_tag,'inner'),None) self.assertEqual(get_tag(self.empty_tag,'inner', 'blue elephant'),\ 'blue elephant') self.assertEqual(get_tag(self.single_tag,'non-existing tag'),None) self.assertEqual(get_tag(self.single_tag,'non-existing tag',\ 'pink elephant'),'pink elephant') self.assertEqual(get_tag(self.single_tag,'inner'),'content') def test_get_tag_fail(self): """Make sure the tag and name parameters are in the proper types.""" self.assertRaises(AttributeError, get_tag,None,"h1") self.assertRaises(AttributeError, get_tag,\ "<h1>This is not a XML tag object</h1>","h1") class MinimalBlastParser7Tests(TestCase): """Tests for the functions required by the Blast XML parsers.""" def setUp(self): self.hit1 = xml.dom.minidom.parseString(HIT_WITH_ONE_HSP) self.hit2 = xml.dom.minidom.parseString(HIT_WITH_TWO_HSPS) self.hsp1 = xml.dom.minidom.parseString(HSP_ONE) self.hsp2 = xml.dom.minidom.parseString(HSP_TWO) self.hsp_gaps = xml.dom.minidom.parseString(HSP_WITH_GAPS) self.param = xml.dom.minidom.parseString(PARAM_XML) self.header = xml.dom.minidom.parseString(HEADER_XML) self.complete = xml.dom.minidom.parseString(HEADER_COMPLETE) def test_parse_header(self): """Fields from XML header tag should be available as dict.""" data = parse_header(self.header) self.assertEqual(data.get('application'), 'my Grandma') self.assertEqual(data.get('version'), 'has') self.assertEqual(data.get('reference'), 'furry') self.assertEqual(data.get('query_letters'), 27) self.assertEqual(data.get('database'), 'Cats') def test_parse_parameters(self): """Fields from XML parameter tag should be available as dict.""" data = parse_parameters(self.param) self.assertEqual(data.get('matrix'), 'BLOSUM62') self.assertEqual(data.get('expect'), '10') self.assertEqual(data.get('gap_open_penalty'), 11.1) self.assertEqual(data.get('gap_extend_penalty'), 22.2) self.assertEqual(data.get('filter'), 'F') def test_parse_header_complete(self): """Fields from header+param tag should be available as dict.""" # try to process header with parameters etc in the XML data = parse_header(self.complete) self.assertEqual(data.get('database'), 'Cats') self.assertEqual(data.get('matrix'), 'BLOSUM62') def test_parse_hit(self): """Should return a list with all values for a hit+hsp.""" data = parse_hit(self.hit1) self.assertEqual(len(data),1) d = dict(zip(HIT_XML_FIELDNAMES,data[0])) self.assertEqual(d['SUBJECT_ID'],"gi|148670104|gb|EDL02051.1|") self.assertEqual(d['HIT_DEF'], "insulin-like growth factor 2 receptor, isoform CRA_c [Mus musculus]") self.assertEqual(d['HIT_ACCESSION'],"2001") self.assertEqual(int(d['HIT_LENGTH']),707) # check hit with more HSPs data = parse_hit(self.hit2) self.assertEqual(len(data),2) self.assertNotEqual(data[0],data[1]) def test_parse_hsp(self): """Should return list with all values for a hsp.""" data = parse_hsp(self.hsp1) d = dict(zip(HSP_XML_FIELDNAMES,data)) self.assertEqual(float(d['BIT_SCORE']),1023.46) self.assertEqual(float(d['SCORE']),2645) self.assertEqual(float(d['E_VALUE']),0.333) self.assertEqual(int(d['QUERY_START']),4) self.assertEqual(int(d['QUERY_END']),18) self.assertEqual(int(d['SUBJECT_START']),5) self.assertEqual(int(d['SUBJECT_END']),19) self.assertEqual(int(d['GAP_OPENINGS']),0) self.assertEqual(int(d['ALIGNMENT_LENGTH']),14) self.assertEqual(d['QUERY_ALIGN'],'ELEPHANTTHISISAHITTIGER') self.assertEqual(d['MIDLINE_ALIGN'],'ORCA-WHALE') self.assertEqual(d['SUBJECT_ALIGN'],'SEALSTHIS---HIT--GER') class BlastXmlResultTests(TestCase): """Tests parsing of output of Blast with output mode 7 (XML).""" def setUp(self): self.result = BlastXMLResult(COMPLETE_XML,xml=True) def test_options(self): """Constructor should take parser as an option.""" result = BlastXMLResult(COMPLETE_XML,parser=MinimalBlastParser7) self.assertEqual(len(result.keys()),1) # make sure whether normal Blast parser still works upon code merge! def test_parsed_query_sequence(self): """The result dict should have one query sequence as a key.""" # The full query sequence is not given in the XML file. # Thus it is not checked explicitly, only whether there is # exactly one found. self.assertEqual(len(self.result.keys()),1) def test_parsed_iterations(self): """The result should have the right number of iterations.""" n_iter = 0 for query_id,hits in self.result.iterHitsByQuery(): n_iter += 1 self.assertEqual(n_iter,1) def test_parsed_hsps(self): """The result should have the right number of hsps.""" n_hsps = 0 for query_id,hsps in self.result.iterHitsByQuery(): n_hsps += len(hsps) self.assertEqual(n_hsps,3) def test_parse_hit_details(self): """The result should have data from hit fields.""" for query in self.result: first_hsp = self.result[query][0][0] self.assertEqual(first_hsp['SUBJECT_ID'], "gi|148670104|gb|EDL02051.1|") self.assertEqual(first_hsp['HIT_DEF'], "insulin-like growth factor 2 receptor, isoform CRA_c [Mus musculus]") self.assertEqual(first_hsp['HIT_ACCESSION'],"2001") self.assertEqual(first_hsp['HIT_LENGTH'],707) def test_parse_hsp_details(self): """The result should have data from hsp fields.""" for query in self.result: # should check integers in next version. first_hsp = self.result[query][0][0] self.assertEqual(first_hsp['QUERY ID'],1) self.assertEqual(first_hsp['BIT_SCORE'],'1023.46') self.assertEqual(first_hsp['SCORE'],'2645') self.assertEqual(first_hsp['E_VALUE'],'0.333') self.assertEqual(first_hsp['QUERY_START'],'4') self.assertEqual(first_hsp['QUERY_END'],'18') self.assertEqual(first_hsp['QUERY_ALIGN'],'ELEPHANTTHISISAHITTIGER') self.assertEqual(first_hsp['MIDLINE_ALIGN'],'ORCA-WHALE') self.assertEqual(first_hsp['SUBJECT_ALIGN'],'SEALSTHIS---HIT--GER') self.assertEqual(first_hsp['SUBJECT_START'],'5') self.assertEqual(first_hsp['SUBJECT_END'],'19') self.assertEqual(first_hsp['PERCENT_IDENTITY'],'55') self.assertEqual(first_hsp['POSITIVE'],'555') self.assertEqual(first_hsp['GAP_OPENINGS'],0) self.assertEqual(first_hsp['ALIGNMENT_LENGTH'],'14') gap_hsp = self.result[query][0][1] self.assertEqual(gap_hsp['GAP_OPENINGS'],'33') HSP_XML = """ <Hsp> <Hsp_num>1</Hsp_num> <Hsp_bit-score>1023.46</Hsp_bit-score> <Hsp_score>2645</Hsp_score> <Hsp_evalue>0.333</Hsp_evalue> <Hsp_query-from>4</Hsp_query-from> <Hsp_query-to>18</Hsp_query-to> <Hsp_hit-from>5</Hsp_hit-from> <Hsp_hit-to>19</Hsp_hit-to> <Hsp_query-frame>1</Hsp_query-frame> <Hsp_hit-frame>1</Hsp_hit-frame> <Hsp_identity>55</Hsp_identity> %s <Hsp_positive>555</Hsp_positive> <Hsp_align-len>14</Hsp_align-len> <Hsp_qseq>ELEPHANTTHISISAHITTIGER</Hsp_qseq> <Hsp_hseq>SEALSTHIS---HIT--GER</Hsp_hseq> <Hsp_midline>ORCA-WHALE</Hsp_midline> </Hsp> """ HSP_ONE = HSP_XML%'' HSP_WITH_GAPS = HSP_XML%'<Hsp_gaps>33</Hsp_gaps>' HSP_TWO = """ <Hsp> <Hsp_num>2</Hsp_num> <Hsp_bit-score>1023.46</Hsp_bit-score> <Hsp_score>2645</Hsp_score> <Hsp_evalue>0.333</Hsp_evalue> <Hsp_query-from>6</Hsp_query-from> <Hsp_query-to>22</Hsp_query-to> <Hsp_hit-from>5</Hsp_hit-from> <Hsp_hit-to>23</Hsp_hit-to> <Hsp_query-frame>1</Hsp_query-frame> <Hsp_hit-frame>1</Hsp_hit-frame> <Hsp_identity>55</Hsp_identity> %s <Hsp_positive>555</Hsp_positive> <Hsp_align-len>18</Hsp_align-len> <Hsp_qseq>EPHANT---THISISAHIT-TIGER</Hsp_qseq> <Hsp_hseq>ALSWWWTHIS---HITW--GER</Hsp_hseq> <Hsp_midline>ORCA-WHALE</Hsp_midline> </Hsp> """ HIT_XML = """ <Hit> <Hit_num>1</Hit_num> <Hit_id>gi|148670104|gb|EDL02051.1|</Hit_id> <Hit_def>insulin-like growth factor 2 receptor, isoform CRA_c [Mus musculus]</Hit_def> <Hit_accession>2001</Hit_accession> <Hit_len>707</Hit_len> <Hit_hsps> %s </Hit_hsps> </Hit> """ HIT_WITH_ONE_HSP = HIT_XML%HSP_ONE HIT_WITH_TWO_HSPS = HIT_XML%(HSP_WITH_GAPS+HSP_TWO) PARAM_XML = """ <BlastOutput_param> <Parameters> <Parameters_matrix>BLOSUM62</Parameters_matrix> <Parameters_expect>10</Parameters_expect> <Parameters_gap-open>11.1</Parameters_gap-open> <Parameters_gap-extend>22.2</Parameters_gap-extend> <Parameters_filter>F</Parameters_filter> </Parameters> </BlastOutput_param> """ HEADER_XML = """ <BlastOutput> <BlastOutput_program>my Grandma</BlastOutput_program> <BlastOutput_version>has</BlastOutput_version> <BlastOutput_db>Cats</BlastOutput_db> <BlastOutput_reference>furry</BlastOutput_reference> <BlastOutput_query-len>27</BlastOutput_query-len> %s </BlastOutput> """ HIT_PREFIX = """ <BlastOutput_iterations> <Iteration> <Iteration_hits> """ HIT_SUFFIX = """ </Iteration_hits> </Iteration> </BlastOutput_iterations> """ HEADER_COMPLETE=HEADER_XML%(PARAM_XML+HIT_PREFIX+HIT_WITH_ONE_HSP+\ HIT_WITH_TWO_HSPS+HIT_SUFFIX) COMPLETE_XML = """<?xml version="1.0"?> <!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"> """+HEADER_COMPLETE if __name__ == '__main__': main()