# Copyright 2001 by Katharine Lindner. All Rights Reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.
 
from __future__ import print_function
 
import warnings
from Bio import BiopythonDeprecationWarning
 
# Emitted when the module is imported: points users at the replacement
# parser.  The message is assembled from adjacent string literals, so every
# fragment must carry its own trailing space.
warnings.warn("The module Bio.UniGene.UniGene is now deprecated, "
              "and will be removed in a future release of Biopython. "
              "To parse UniGene flat files, please use the parser in "
              "Bio.UniGene instead",
              BiopythonDeprecationWarning)
 
import string
import sgmllib
import UserDict
import Bio.File
 
 
class UniGeneParser( sgmllib.SGMLParser ):
 
    def reset( self ):
        sgmllib.SGMLParser.reset( self )
        self.text = ''
        self.queue = UserDict.UserDict()
        self.open_tag_stack = []
        self.open_tag = 'open_html'
        self.key_waiting = ''
        self.master_key = ''
        self.context = 'general_info'
 
    def parse( self, handle ):
        self.reset()
        self.feed( handle )
        for key in self.queue:
            if( self.queue[ key ] == {} ):
                if( key[ :15 ] == 'UniGene Cluster' ):
                    self.queue[ 'UniGene Cluster' ] = key[ 16: ]
                del self.queue[ key ]
        return self.queue
 
#
# Assumes an empty line between records
#
    def feed( self, handle ):
        if isinstance(handle, Bio.File.UndoHandle):
            uhandle = handle
        else:
            uhandle = Bio.File.UndoHandle(handle)
        text = ''
        while True:
            line = uhandle.readline()
            line = string.strip( line )
            if( line == '' ):
                break
            text = text + ' ' + line
 
        sgmllib.SGMLParser.feed( self, text )
 
    def handle_data(self, newtext ):
        newtext = string.strip( newtext )
        self.text = self.text + newtext
 
    def start_a( self, attrs ):
        if( self.context == 'seq_info' ):
            if( self.open_tag != 'open_b' ):
                self.text = ''
 
#        self.queue.append( attrs )
 
    def end_a( self ):
        if( self.context == 'seq_info' ):
            if( self.open_tag != 'open_b' ):
                if( self.key_waiting == '' ):
                    self.key_waiting = self.text
                    self.text = ''
 
    def start_b( self, attrs ):
 
        self.open_tag_stack.append( self.open_tag )
        self.open_tag = 'open_b'
        if( self.key_waiting == '' ):
            self.text = ''
 
    def end_b( self ):
        if( self.text[ :15 ] == 'UniGene Cluster' ):
            self.queue[ 'UniGene Cluster' ] = self.text[ 16: ]
            self.text = ''
        elif( self.key_waiting == '' ):
            self.extract_key()
 
    def extract_key( self ):
        text = string.strip( self.text )
        key = string.join( string.split( text ) )
        words = string.split( key )
        key = string.join( words[ :2 ] )
        self.text = ''
 
        try:
            self.open_tag = self.open_tag_stack.pop()
        except:
            self.open_tag = 'open_html'
        if( self.open_tag == 'open_table_data' ):
            if( self.context == 'general_info' ):
                if( self.key_waiting == '' ):
                    self.key_waiting = key
                    self.text = ''
            elif( self.context == 'seq_info' ):
                if( text == 'Key to Symbols' ):
                    self.context = 'legend'
                    self.master_key = key
        elif( self.context == 'general_info' ):
            self.master_key = key
            if 'SEQUENCE' in key:
                self.context = 'seq_info'
            self.queue[ key ] = UserDict.UserDict()
        elif( self.context == 'seq_info' ):
            self.queue[ key ] = UserDict.UserDict()
            self.master_key = key
 
    def start_table( self, attrs ):
        self.open_tag_stack.append( self.open_tag )
        self.open_tag = 'open_table'
 
    def end_table( self ):
        try:
            self.open_tag = self.open_tag_stack.pop()
        except:
            self.open_tag = 'open_html'
        self.key_waiting = ''
 
    def start_tr( self, attrs ):
        self.open_tag_stack.append( self.open_tag )
        self.open_tag = 'open_table_row'
        self.text = ''
 
    def end_tr( self ):
        try:
            self.open_tag = self.open_tag_stack.pop()
        except:
            self.open_tag = 'open_html'
        text = self.text
        if text:
            self.text = ''
            if( text[ 0 ] == ':' ):
                text = text[ 1: ]
            text = string.join( string.split( text ) )
            if self.context == 'general_info' or \
               self.context == 'seq_info':
                try:
                    contents = self.queue[ self.master_key ][ self.key_waiting ]
                    if isinstance(contents, list):
                        contents.append( text )
                    else:
                        self.queue[ self.master_key ][ self.key_waiting ] = \
                            [ contents, text ]
                except:
                    self.queue[ self.master_key ][ self.key_waiting ] = text
 
                self.key_waiting = ''
 
    def start_td( self, attrs ):
        self.open_tag_stack.append( self.open_tag )
        self.open_tag = 'open_table_data'
 
    def end_td( self ):
        try:
            self.open_tag = self.open_tag_stack.pop()
        except:
            self.open_tag = 'open_html'
        if( self.context == 'seq_info' ):
            self.text = self.text + ' '
 
    def print_item( self, item, level = 1 ):
        indent = '    '
        for j in range( 0, level ):
            indent = indent + '    '
        if isinstance(item, str):
            if( item != '' ):
                print('%s%s' % ( indent, item ))
        elif isinstance(item, list):
            for subitem in item:
                self.print_item( subitem, level + 1 )
        elif( isinstance( item, UserDict.UserDict ) ):
            for subitem in item:
                print('%skey is %s' % ( indent, subitem ))
                self.print_item( item[ subitem ], level + 1 )
        else:
            print(item)
 
    def print_tags( self ):
        for key in self.queue:
            print('key %s' % key)
            self.print_item( self.queue[ key ] )
 
 
# Manual smoke test: parse the sample page 'Hs13225.htm' from the current
# directory and print everything that was collected.
if __name__ == '__main__':
    with open('Hs13225.htm') as handle:
        undo_handle = Bio.File.UndoHandle(handle)
        unigene_parser = UniGeneParser()
        # Hand over the wrapper that was built above instead of leaving it
        # unused and passing the raw handle.
        unigene_parser.parse(undo_handle)
        unigene_parser.print_tags()