#!/usr/bin/env python """Unit tests for locuslink-specific classes """ from cogent.parse.locuslink import ll_start,LLFinder,pipes,first_pipe,commas, \ _read_accession, _read_rell, _read_accnum, \ _read_map, _read_sts, _read_comp, _read_grif, _read_pmid, _read_go, \ _read_extannot, _read_cdd, _read_contig, LocusLink, LinesToLocusLink from cogent.util.unit_test import TestCase, main __author__ = "Rob Knight" __copyright__ = "Copyright 2007-2012, The Cogent Project" __credits__ = ["Rob Knight"] __license__ = "GPL" __version__ = "1.5.3-dev" __maintainer__ = "Rob Knight" __email__ = "rob@spot.colorado.edu" __status__ = "Production" class locuslinkTests(TestCase): """Tests toplevel functions.""" def test_read_accession(self): """_read_accession should perform correct conversions""" self.assertEqual(_read_accession('NP_035835|6755985|na\n'), \ {'Accession':'NP_035835','Gi':'6755985','Strain':'na'}) #check that it ignores additional fields self.assertEqual(_read_accession('NG_002740|30172554|na|1|1315\n'), \ {'Accession':'NG_002740','Gi':'30172554','Strain':'na'}) def test_read_rell(self): """_read_rell should perform correct conversions""" self.assertEqual(_read_rell(\ 'related mRNA|AK090391|n|NM_153775--AK090391\n'), \ {'Description':'related mRNA','Id':'AK090391','IdType':'n',\ 'Printable':'NM_153775--AK090391'}) def test_read_accnum(self): """_read_accnum should perform correct conversions""" self.assertEqual(_read_accnum('NG_002740|30172554|na|1|1315\n'), \ {'Accession':'NG_002740','Gi':'30172554','Strain':'na',\ 'Start':'1','End':'1315'}) def test_read_map(self): """_read_map should perform correct conversions""" self.assertEqual(_read_map('10 C1|RefSeq|C|\n'), \ {'Location':'10 C1', 'Source':'RefSeq','Type':'C'}) def test_read_sts(self): """_read_sts should perform correct conversions""" self.assertEqual(_read_sts('RH35858|2|37920|na|seq_map|epcr\n'), \ {'Name':'RH35858','Chromosome':'2','StsId':'37920', 'Segment':'na',\ 'SequenceKnown':'seq_map', 'Evidence':'epcr'}) def test_read_cdd(self): """_read_cdd should perform correct conversions""" self.assertEqual(_read_cdd(\ 'Immunoglobulin C-2 Type|smart00408|103|na|4.388540e+01\n'), {'Name':'Immunoglobulin C-2 Type','Key':'smart00408',\ 'Score':'103', 'EValue':'na', 'BitScore':'4.388540e+01'}) def test_read_comp(self): """_read_comp should perform correct conversions""" self.assertEqual(_read_comp(\ '10090|Map2k6|11|11 cM|26399|17|MAP2K6|ncbi_mgd\n'), \ {'TaxonId':'10090', 'Symbol':'Map2k6', 'Chromosome':'11', \ 'Position':'11 cM', 'LocusId':'26399', 'ChromosomeSelf':'17', \ 'SymbolSelf':'MAP2K6','MapName':'ncbi_mgd'}) def test_read_grif(self): """_read_grif should perform correct conversions""" self.assertEqual(_read_grif('12037672|interaction with pRb\n'), \ {'PubMedId':'12037672', 'Description':'interaction with pRb'}) def test_read_pmid(self): """_read_pmid should perform correct conversions""" self.assertEqual(_read_pmid('12875969,12817023,12743034\n'), \ ['12875969','12817023','12743034']) def test_read_go(self): """_read_go should perform correct conversions""" self.assertEqual(_read_go(\ 'molecular function|zinc ion binding|IEA|GO:0008270|GOA|na\n'), \ {'Category':'molecular function', 'Term':'zinc ion binding',\ 'EvidenceCode':'IEA','GoId':'GO:0008270','Source':'GOA', \ 'PubMedId':'na'}) def test_read_extannot(self): """_read_extannot should perform correct conversions""" self.assertEqual(_read_extannot(\ 'cellular role|Pol II transcription|NR|Proteome|8760285\n'), \ {'Category':'cellular role','Term':'Pol II transcription',\ 'EvidenceCode':'NR', 'Source':'Proteome', 'PubMedId':'8760285'}) def test_read_contig(self): """_read_contig should perform correct conversions""" self.assertEqual(_read_contig(\ 'NT_011109.15|29800594|na|31124734|31133047|-|19|reference\n'),\ {'Accession':'NT_011109.15','Gi':'29800594','Strain':'na',\ 'From':'31124734','To':'31133047','Orientation':'-',\ 'Chromosome':'19','Assembly':'reference'}) def test_LinesToLocusLink(self): """LinesToLocusLink should give expected results on sample data""" fake_file = \ """>>1 LOCUSID: 1 LOCUS_CONFIRMED: yes LOCUS_TYPE: gene with protein product, function known or inferred ORGANISM: Homo sapiens STATUS: REVIEWED NM: NM_130786|21071029|na NP: NP_570602|21071030 CDD: Immunoglobulin C-2 Type|smart00408|103|na|4.388540e+01 PRODUCT: alpha 1B-glycoprotein ASSEMBLY: AF414429,AK055885,AK056201 CONTIG: NT_011109.15|29800594|na|31124734|31133047|-|19|reference EVID: supported by alignment with mRNA XM: NM_130786|21071029|na XP: NP_570602|21071030|na ACCNUM: AC010642|9929687|na|43581|41119 TYPE: g ACCNUM: AF414429|15778555|na|na|na TYPE: m PROT: AAL07469|15778556 ACCNUM: AK055885|16550723|na|na|na TYPE: m ACCNUM: AK056201|16551539|na|na|na TYPE: m ACCNUM: BC035719|23273475|na|na|na TYPE: m PROT: AAH35719|23273476 ACCNUM: none|na|na|na|na TYPE: p PROT: P04217|23503038 OFFICIAL_SYMBOL: A1BG OFFICIAL_GENE_NAME: alpha-1-B glycoprotein ALIAS_SYMBOL: A1B ALIAS_SYMBOL: ABG ALIAS_SYMBOL: GAB PREFERRED_PRODUCT: alpha 1B-glycoprotein SUMMARY: Summary: The protein encoded by this gene is a plasma glycoprotein of unknown function. The protein shows sequence similarity to the variable regions of some immunoglobulin supergene family member proteins. CHR: 19 STS: RH65092|-|10673|na|na|epcr STS: WI-16009|-|52209|na|na|epcr STS: G59506|-|136670|na|na|epcr COMP: 10090|A1bg|na|na|117586|19|A1BG|ncbi_mgd COMP: 10090|A1bg|7|7 cM|117586|19|A1BG|ncbi_mgd BUTTON: unigene.gif LINK: http://www.ncbi.nlm.nih.gov/UniGene/clust.cgi?ORG=Hs&CID=390608 UNIGENE: Hs.390608 OMIM: 138670 MAP: 19q13.4|RefSeq|C| MAPLINK: default_human_gene|A1BG BUTTON: snp.gif LINK: http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?locusId=1 BUTTON: homol.gif LINK: http://www.ncbi.nlm.nih.gov/HomoloGene/homolquery.cgi?TEXT=1[loc]&TAXID=9606 BUTTON: ensembl.gif LINK: http://www.ensembl.org/Homo_sapiens/contigview?geneid=NM_130786 BUTTON: ucsc.gif LINK: http://genome.ucsc.edu/cgi-bin/hgTracks?org=human&position=NM_130786 BUTTON: mgc.gif LINK: http://mgc.nci.nih.gov/Genes/GeneInfo?ORG=Hs&CID=390608 PMID: 12477932,8889549,3458201,2591067 GO: molecular function|molecular_function unknown|ND|GO:0005554|GOA|3458201 GO: biological process|biological_process unknown|ND|GO:0000004|GOA|na GO: cellular component|extracellular|IDA|GO:0005576|GOA|3458201 >>386590 LOCUSID: 386590 LOCUS_CONFIRMED: yes LOCUS_TYPE: gene with protein product, function known or inferred ORGANISM: Danio rerio ACCNUM: AF510108|31323727|na|na|na TYPE: m PROT: AAP47138|31323728 OFFICIAL_SYMBOL: tra1 OFFICIAL_GENE_NAME: tumor rejection antigen (gp96) 1 BUTTON: zfin.gif LINK: http://zfin.org/cgi-bin/ZFIN_jump?record=ZDB-GENE-031002-1 PMID: 14499652""" records = list(LLFinder(fake_file.split('\n'))) self.assertEqual(len(records), 2) first, second = map(LinesToLocusLink, records) #test the second one first, since it's shorter self.assertEqual(second.LOCUSID, 386590) self.assertEqual(second.LOCUS_CONFIRMED, 'yes') self.assertEqual(second.LOCUS_TYPE, \ 'gene with protein product, function known or inferred') self.assertEqual(second.ORGANISM, 'Danio rerio') self.assertEqual(second.ACCNUM, [{'Accession':'AF510108', \ 'Gi':'31323727', 'Strain':'na','Start':'na','End':'na'}]) self.assertEqual(second.TYPE, ['m']) self.assertEqual(second.PROT, \ [{'Accession':'AAP47138','Gi':'31323728'}]) self.assertEqual(second.OFFICIAL_SYMBOL, 'tra1') self.assertEqual(second.OFFICIAL_GENE_NAME, \ 'tumor rejection antigen (gp96) 1') self.assertEqual(second.BUTTON, ['zfin.gif']) self.assertEqual(second.LINK, \ ['http://zfin.org/cgi-bin/ZFIN_jump?record=ZDB-GENE-031002-1']) self.assertEqual(second.PMID, ['14499652']) #now for the annoying test on the longer record self.assertEqual(first.LOCUSID, 1) self.assertEqual(first.LOCUS_CONFIRMED, 'yes') self.assertEqual(first.ORGANISM, 'Homo sapiens') self.assertEqual(first.LOCUS_TYPE, \ 'gene with protein product, function known or inferred') self.assertEqual(first.STATUS, 'REVIEWED') self.assertEqual(first.NM, [{'Accession':'NM_130786','Gi':'21071029', \ 'Strain':'na'}]) self.assertEqual(first.NP, [{'Accession':'NP_570602','Gi':'21071030'}]) self.assertEqual(first.CDD, [{'Name':'Immunoglobulin C-2 Type',\ 'Key':'smart00408','Score':'103', 'EValue':'na',\ 'BitScore':'4.388540e+01'}]) self.assertEqual(first.PRODUCT, ['alpha 1B-glycoprotein']) self.assertEqual(first.ASSEMBLY, [['AF414429','AK055885','AK056201']]) self.assertEqual(first.CONTIG, [{'Accession':'NT_011109.15',\ 'Gi':'29800594','Strain':'na', 'From':'31124734','To':'31133047',\ 'Orientation':'-','Chromosome':'19','Assembly':'reference'}]) self.assertEqual(first.EVID, ['supported by alignment with mRNA']) self.assertEqual(first.XM, [{'Accession':'NM_130786', 'Gi':'21071029', \ 'Strain':'na'}]) self.assertEqual(first.XP, [{'Accession':'NP_570602', 'Gi':'21071030', \ 'Strain':'na'}]) self.assertEqual(first.ACCNUM, [ \ {'Accession':'AC010642','Gi':'9929687','Strain':'na',\ 'Start':'43581', 'End':'41119'}, {'Accession':'AF414429','Gi':'15778555','Strain':'na',\ 'Start':'na', 'End':'na'}, {'Accession':'AK055885','Gi':'16550723','Strain':'na',\ 'Start':'na', 'End':'na'}, {'Accession':'AK056201','Gi':'16551539','Strain':'na',\ 'Start':'na', 'End':'na'}, {'Accession':'BC035719','Gi':'23273475','Strain':'na',\ 'Start':'na', 'End':'na'}, {'Accession':'none','Gi':'na','Strain':'na',\ 'Start':'na', 'End':'na'}, ]) self.assertEqual(first.TYPE, ['g','m','m','m','m','p']) self.assertEqual(first.PROT, [ \ {'Accession':'AAL07469', 'Gi':'15778556'}, {'Accession':'AAH35719', 'Gi':'23273476'}, {'Accession':'P04217', 'Gi':'23503038'}, ]) self.assertEqual(first.OFFICIAL_SYMBOL, 'A1BG') self.assertEqual(first.OFFICIAL_GENE_NAME, 'alpha-1-B glycoprotein') self.assertEqual(first.ALIAS_SYMBOL, ['A1B','ABG','GAB']) self.assertEqual(first.PREFERRED_PRODUCT, ['alpha 1B-glycoprotein']) self.assertEqual(first.SUMMARY, ["""Summary: The protein encoded by this gene is a plasma glycoprotein of unknown function. The protein shows sequence similarity to the variable regions of some immunoglobulin supergene family member proteins."""]) self.assertEqual(first.CHR, ['19']) self.assertEqual(first.STS, [ {'Name':'RH65092','Chromosome':'-','StsId':'10673','Segment':'na',\ 'SequenceKnown':'na','Evidence':'epcr'}, {'Name':'WI-16009','Chromosome':'-','StsId':'52209','Segment':'na',\ 'SequenceKnown':'na','Evidence':'epcr'}, {'Name':'G59506','Chromosome':'-','StsId':'136670','Segment':'na',\ 'SequenceKnown':'na','Evidence':'epcr'}, ]) self.assertEqual(first.COMP, [ {'TaxonId':'10090','Symbol':'A1bg','Chromosome':'na','Position':'na',\ 'LocusId':'117586', 'ChromosomeSelf':'19','SymbolSelf':'A1BG',\ 'MapName':'ncbi_mgd'}, {'TaxonId':'10090','Symbol':'A1bg','Chromosome':'7','Position':'7 cM',\ 'LocusId':'117586', 'ChromosomeSelf':'19','SymbolSelf':'A1BG',\ 'MapName':'ncbi_mgd'}, ]) self.assertEqual(first.BUTTON, ['unigene.gif','snp.gif','homol.gif', \ 'ensembl.gif', 'ucsc.gif', 'mgc.gif']) self.assertEqual(first.LINK, [ \ 'http://www.ncbi.nlm.nih.gov/UniGene/clust.cgi?ORG=Hs&CID=390608', 'http://www.ncbi.nlm.nih.gov/SNP/snp_ref.cgi?locusId=1', 'http://www.ncbi.nlm.nih.gov/HomoloGene/homolquery.cgi?TEXT=1[loc]&TAXID=9606', 'http://www.ensembl.org/Homo_sapiens/contigview?geneid=NM_130786', 'http://genome.ucsc.edu/cgi-bin/hgTracks?org=human&position=NM_130786', 'http://mgc.nci.nih.gov/Genes/GeneInfo?ORG=Hs&CID=390608', ]) self.assertEqual(first.UNIGENE, ['Hs.390608']) self.assertEqual(first.OMIM, ['138670']) self.assertEqual(first.MAP, [{'Location':'19q13.4','Source':'RefSeq',\ 'Type':'C'}]) self.assertEqual(first.MAPLINK, ['default_human_gene|A1BG']) self.assertEqual(first.PMID, ['12477932','8889549','3458201','2591067']) self.assertEqual(first.GO, [ \ {'Category':'molecular function','Term':'molecular_function unknown',\ 'EvidenceCode':'ND','GoId':'GO:0005554','Source':'GOA',\ 'PubMedId':'3458201'}, {'Category':'biological process','Term':'biological_process unknown',\ 'EvidenceCode':'ND','GoId':'GO:0000004','Source':'GOA',\ 'PubMedId':'na'}, {'Category':'cellular component','Term':'extracellular',\ 'EvidenceCode':'IDA','GoId':'GO:0005576','Source':'GOA',\ 'PubMedId':'3458201'}, ]) if __name__ == '__main__': main()