""" IO.py AIGO is a python library for the Analysis and Inter-comparison of Gene Ontology functional annotations. see (http://code.google.com/p/aigo). Created by Michael Defoin-Platel on 21/02/2010. Copyright (c) 2010. All rights reserved. AIGO is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. """ from pylab import * import os, csv from itertools import izip from AIGO import logger from AIGO.utils.File import checkForZip, readFile IOType={"GO Annotation File":"GAF", "Blast2GO":"B2G", "Affymetrix":"AFFY", "ArrayIDer":"AID", "Mapping GPid GOids": "GP2GO"} #------------------------------------------------------- def extract_COPSA(fileName, G, refSet=None): """ Read a functional annotation from the COPSA pipeline including provenance information and COPSA internal weights Gene_ID sep1 GO_ID,GO_ID sep1 [Prov, Prov] sep1 w1 sep1 w2 .... w16 """ fileName= str(checkForZip(fileName)) if (not os.path.exists(fileName)): raise IOError(fileName + " does not exist and is required ") hasRef=None if refSet: hasRef=dict(izip(refSet, refSet)) GenetoGO, GOtoGene , = dict(), dict() for aspect in G.aspect: GenetoGO[aspect], GOtoGene[aspect] =dict(), dict() AnnotationtoProvenance, ProvenancetoAnnotation = dict(), dict() for aspect in G.aspect: AnnotationtoProvenance[aspect], ProvenancetoAnnotation[aspect] =dict(), dict() f=open(fileName, "r") import re for row in f: data=row.replace("\r", "").replace("\n", "").split("\t") g=data[0] go=data[1].split(",") #provenance=data[2].split(",") provenance=re.split('\],\ +\[|^\[|\]$', data[2])[1:-1] weights=data[3:] if not hasRef is None and not hasRef.has_key(g): logger.handleWarning("gene product %s is not in the reference set, skip it " % g) continue for term, prov in zip(go, provenance): term = term.strip(); prov = ','.join(sort(prov.replace(" ", "").split(','))) #prov = ','.join(prov.replace(" ", "").split(',')) if not term.find('GO:')==0: continue #Get the alternative term if any and its GO aspect term, aspect=G.get_GOAlternative(term, nameSpace=True) if not aspect: logger.handleWarning("%s: term %s is not in GO graph, skip it " % (g, term)) continue GenetoGO[aspect].setdefault(g, set([])).add(term) GOtoGene[aspect].setdefault(term, set([])).add(g) AnnotationtoProvenance[aspect].setdefault((g,term), []).append(prov) ProvenancetoAnnotation[aspect].setdefault(prov, set([])).add((g,term)) f.close() return GenetoGO, GOtoGene, AnnotationtoProvenance, ProvenancetoAnnotation def extract_GP2GO(fileName, G, refSet=None, sep1='\t', sep2=',', comments='#', skiprows=0): """ Read a functional annotation mapping file of the form Gene_ID sep GO_ID,GO_ID sep is \t by default """ fileName= str(checkForZip(fileName)) if (not os.path.exists(fileName)): raise IOError(fileName + " does not exist and is required ") hasRef=None if refSet: hasRef=dict(izip(refSet, refSet)) GenetoGO, GOtoGene = dict(), dict() for aspect in G.aspect: GenetoGO[aspect], GOtoGene[aspect] =dict(), dict() data= loadtxt(fileName, dtype="S", usecols=[0,1], delimiter=sep1, comments=comments, skiprows=skiprows) for g,go in data: if not hasRef is None and not hasRef.has_key(g): logger.handleWarning("gene product %s is not in the reference set, skip it " % g) continue for term in go.split(sep2): term = term.strip(); if not term.find('GO:')==0: continue #Get the alternative term if any and its GO aspect term, aspect=G.get_GOAlternative(term, nameSpace=True) if not aspect: logger.handleWarning("%s: term %s is not in GO graph, skip it " % (g, term)) continue GenetoGO[aspect].setdefault(g, set([])).add(term) GOtoGene[aspect].setdefault(term, set([])).add(g) return GenetoGO, GOtoGene def extract_GO2GP(fileName, G, refSet=None, sep1='\t', sep2=',', comments='#', skiprows=0): """ Read a functional annotation mapping file of the form GO_ID sep Gene_ID,Gene_ID sep is \t by default """ fileName= str(checkForZip(fileName)) if (not os.path.exists(fileName)): raise IOError(fileName + " does not exist and is required ") hasRef=None if refSet: hasRef=dict(izip(refSet, refSet)) GenetoGO, GOtoGene = dict(), dict() for aspect in G.aspect: GenetoGO[aspect], GOtoGene[aspect] =dict(), dict() data= loadtxt(fileName, dtype="S", usecols=[0,1], delimiter=sep1, comments=comments, skiprows=skiprows) for go,GP in data: if not go.find('GO:')==0: continue #Get the alternative term if any and its GO aspect term, aspect=G.get_GOAlternative(go, nameSpace=True) if not aspect: logger.handleWarning("term %s is not in GO graph, skip it " % (term)) continue for gp in GP.split(sep2): gp = gp.strip(); if not hasRef is None and not hasRef.has_key(gp): logger.handleWarning("gene product %s is not in the reference set, skip it " % gp) continue GenetoGO[aspect].setdefault(gp, set([])).add(go) GOtoGene[aspect].setdefault(go, set([])).add(gp) return GenetoGO, GOtoGene def extract_AID(fileName, G, refSet=None): fileName= checkForZip(fileName) if (not os.path.exists(fileName)): raise IOError(fileName+" does not exist and is required ") hasRef=None if refSet: hasRef=dict(izip(refSet, refSet)) GenetoGO, GOtoGene = dict(), dict() for aspect in G.aspect: GenetoGO[aspect], GOtoGene[aspect] =dict(), dict() rd=csv.reader(readFile(fileName), delimiter="\t") header=rd.next() for row in rd: #Old format... #g=row[header.index('ProbesetID')] g=row[header.index('Probeset_id')] go=row[header.index('GO:ID')] if hasRef and not hasRef.has_key(g): logger.handleWarning("gene product %s is not in the reference set, skip it " % g) continue if go.find('GO:')==0: #Get the alternative term if any and its GO aspect go, aspect=G.get_GOAlternative(go, nameSpace=True) if not aspect: logger.handleWarning("term %s is not in GO graph, skip it " % go) continue GenetoGO[aspect].setdefault(g, set([])).add(go) GOtoGene[aspect].setdefault(go,set([])).add(g) return GenetoGO, GOtoGene #Affymetrix TAF format #http://www.affymetrix.com/support/technical/manual/taf_manual.affx def extract_Affy(fileName, G, refSet=None, GO_columns=[30, 31, 32], filetype="Affy", delimiter=',', quoting=csv.QUOTE_ALL): fileName= checkForZip(fileName) if (not os.path.exists(fileName)): raise IOError(fileName+" does not exist and is required ") #sniff and seek dialect csvfile = readFile(fileName) hasRef=None if refSet: hasRef=dict(izip(refSet, refSet)) GenetoGO, GOtoGene = dict(), dict() for aspect in G.aspect: GenetoGO[aspect], GOtoGene[aspect] =dict(), dict() #Skip comments row=csvfile.readline() while row[0] == '#': row=csvfile.readline() #Read Header header=row #rd=list(csv.reader(f)) csv.register_dialect('format', delimiter=delimiter, quoting=quoting) rd=csv.reader(csvfile, dialect='format') for row in rd: #Read gene product id g=row[0] if hasRef and not hasRef.has_key(g): logger.handleWarning("gene product %s is not in the reference set, skip it " % g) continue for aspect, i in zip(['biological_process', 'cellular_component', 'molecular_function'], GO_columns): for item in row[i].split('///'): if not item=="---" and not len(item.strip())==0: go="GO:%07d" % int(item.split('//')[0].replace('/','')) go, aspect=G.get_GOAlternative(go, nameSpace=True) if not aspect: logger.handleWarning("term %s is not in GO graph, skip it " % go) continue GenetoGO[aspect].setdefault(g, set([])).add(go) GOtoGene[aspect].setdefault(go, set([])).add(g) return GenetoGO, GOtoGene def readGAF_2(fileName): GAF_col=["DB","DB Object ID","DB Object Symbol","Qualifier", "GO ID","DB:Reference","Evidence Code","With (or) From", "Aspect","DB Object Name","DB Object Synonym","DB Object Type", "Taxon(|taxon)","Date","Assigned By","Annotation Extension","Gene Product Form ID"] #Read the entire file data=[row for row in csv.reader(readFile(fileName), delimiter="\t")] #Read the header seek=0 GAF_OK=False while data[seek][0][0]=="!": if data[seek][0]=="!gaf-version: 2.0": GAF_OK=True seek=seek+1 if not GAF_OK: raise Exception("Sorry, GAF format version 2.0 expected.") return iter(data[seek:]), GAF_col def extract_GAF(fileName, G, refSet=None): fileName= checkForZip(fileName) if (not os.path.exists(fileName)): raise IOError(fileName+" does not exist and is required ") hasRef=None if refSet: refRef=dict(izip(refSet, refSet)) GenetoGO, GOtoGene = dict(), dict() for aspect in G.aspect: GenetoGO[aspect], GOtoGene[aspect] =dict(), dict() data,GAF_col = readGAF_2(fileName) for row in data: #g=row[GAF_col.index("DB Object Symbol")] g=".".join([row[GAF_col.index("Taxon(|taxon)")][6:],row[GAF_col.index("DB Object Symbol")]]) go=row[GAF_col.index('GO ID')] if not row[GAF_col.index('Qualifier')].find('NOT')==-1: logger.handleWarning("go term %s for gene product %s is qualified as NOT: ignored" % (go, g)) continue if hasRef and not hasRef.has_key(g): logger.handleWarning("gene product %s is not in the reference set, skip it " % g) continue if go.find('GO:')==0: go, aspect=G.get_GOAlternative(go, nameSpace=True) if not aspect: logger.handleWarning("term %s is not in GO graph, skip it " % go) continue GenetoGO[aspect].setdefault(g, set([])).add(go) GOtoGene[aspect].setdefault(go,set([])).add(g) return GenetoGO, GOtoGene def extract_SCOP(fileName, G, refSet=None): fileName= checkForZip(fileName) if (not os.path.exists(fileName)): raise IOError(fileName+" does not exist and is required ") hasRef=None if refSet: hasRef=dict(izip(refSet, refSet)) GenetoGO, GOtoGene = dict(), dict() for aspect in G.aspect: GenetoGO[aspect], GOtoGene[aspect] =dict(), dict() rd=csv.reader(readFile(fileName), delimiter=";") header=rd.next() for row in rd: #Read gene product id g=row[0] g=row[header.index('domScop')] go=row[header.index('termGo')] if hasRef and not hasRef.has_key(g): logger.handleWarning("gene product %s is not in the reference set, skip it " % g) continue if go.find('GO:')==0: #Get the alternative term if any and its GO aspect go, aspect=G.get_GOAlternative(go, nameSpace=True) if not aspect: logger.handleWarning("term %s is not in GO graph, skip it " % term) continue GenetoGO[aspect].setdefault(g, set([])).add(go) GOtoGene[aspect].setdefault(go,set([])).add(g) return GenetoGO, GOtoGene