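# Sharing links left over from the web page this listing was copied from
# (not part of the program):
#   - Facebook
#   - Twitter
#   - Reddit
#   - StumbleUpon
#   - Digg
#   - email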

from nltk.tokenizer import *
from features import *
from re import *
from nltk.corpus import *
 
#we have a list of closed-form words which are not counted in our tf.idf calculations
#(closest_match() discards any word of the sentence that appears in this list before scoring)
stop_list = []
#used to split a paragraph (or the abstract) into sentences. The pattern matches '!', '?',
#or a '.' that is not immediately followed by a digit or a comma; because alternation binds
#loosest, the trailing \s* belongs only to the '.' alternative.
#NOTE(review): negative=1 presumably tells RETokenizer that the pattern describes the
#separators rather than the tokens themselves - confirm against the NLTK tokenizer docs
sentence_tokenizer = RETokenizer(r'\!|\?|(\.(?!(\d|,)))\s*', negative=1)
#used to split a sentence into words (presumably on whitespace, going by the class name - confirm)
word_tokenizer = WSTokenizer()
 
class TrainingSample:
    def __init__(self, abstract=[], text=[], filename=""):
 
        #if there is no filename, we have to process the text ourself
        if filename == "":
 
            #convert the list of paragraphs in to a list of tokenized paragraphs...
            p = 0
            self.text = []
            for p in range(0, len(text)):
                #...where a paragraph is a token, containing a list of tokenized sentences and a location
                self.text.append(Token(sentence_tokenizer.tokenize(text[p]), Location(p)))
 
            self.extract = []
            sentences = [s.type() for s in sentence_tokenizer.tokenize(abstract)]
            #for each sentence in the abstract            
            for sentence in sentences:
                print 'matching sentence:', sentence
                #record which sentence in the text (if any) matches most closely
                loc = closest_match(sentence, text)
                if loc != None:
                    (p, s) = loc
                    self.extract.append(Token(sentence, Location(s, unit="s", source=Location(p, unit="p"))))
 
            print 'created extract', self.extract
            print
 
        #they have given us a filename
        else:
 
            fp = open(filename)
 
            self.text = []            
            #loop through each paragraph
            text_length = int(fp.readline())
            for p in range(0, text_length):
                #read the paragraph, and drop the trailing \n
                paragraph = fp.readline()
                paragraph = paragraph[0:len(paragraph)-1]
                #add this paragraph in
                self.text.append(Token(sentence_tokenizer.tokenize(paragraph), Location(p)))
 
            self.extract = []
            #loop through each sentence in the abstract
            abstract_length = int(fp.readline())
            for i in range(0, abstract_length):
                sentence = fp.readline()
                sentence_no = int(fp.readline())
                paragraph_no = int(fp.readline())
                #add in the sentence, and the location of the corresponding sentence in the document
                self.extract.append(Token(sentence[0:len(sentence)-1], Location(sentence_no, unit="s",
                                                                                source=Location(paragraph_no, unit="p"))))
            fp.close()            
 
    #calculate the given features on this sample
    def train(self, features):
        results = dict()
        for feature in features:
            #calculate the feature on both the document and summary
            results[feature] = (feature.evaluate_document(self.text), feature.evaluate_summary(self.extract, self.text))
        return results
 
    def write(self, filename):
        fp = open(filename, 'w')
        fp.write(len(self.text).__repr__()+'\n')
        for paragraph in self.text:
            for sentence in paragraph.type():                
                fp.write(sentence.type()+'. ')
            fp.write('\n')
 
        fp.write(len(self.extract).__repr__()+'\n')
        for sentence in self.extract:
            fp.write(sentence.type()+'\n')            
            fp.write(sentence.loc().start().__repr__()+'\n')
            fp.write(sentence.loc().source().start().__repr__()+'\n')
        fp.close()
 
#Work out the closest match to the given sentence, out of all the sentences in the document
def closest_match(sentence1, document, min_fraction=0.75):
    """Find the sentence of document that best matches sentence1.

    Each distinct word of sentence1 that is not in stop_list is weighted by
    1/df, where df is the number of document sentences containing the word.
    A document sentence scores the sum of the weights of the words it contains.

    sentence1    -- the sentence to look for (a string, split with word_tokenizer)
    document     -- list of paragraphs (each split with sentence_tokenizer)
    min_fraction -- the best score must reach this fraction of the maximum possible
                    score (the sum of the weights of all words found anywhere in
                    the document) for the match to be accepted

    Returns (paragraph_no, sentence_no) of the best sentence, or None when no
    sentence is good enough - including when none of the words occurs in the
    document at all. Ties go to the earliest sentence in the document.
    """
    #make a list of the distinct words, leaving out the closed-form words.
    #A new list is built rather than removing from a list while looping over it,
    #which skips the element following each removal.
    ignored = set(stop_list)
    seen = set()
    words = []
    for token in word_tokenizer.tokenize(sentence1):
        word = token.type()
        if word in seen or word in ignored:
            continue
        seen.add(word)
        words.append(word)

    #tokenize the document once: one (location, set of words) entry per sentence,
    #in document order
    sentences = []
    for paragraph_no, paragraph in enumerate(document):
        for sentence_no, sentence2 in enumerate(sentence_tokenizer.tokenize(paragraph)):
            types = set([token.type() for token in word_tokenizer.tokenize(sentence2.type())])
            sentences.append(((paragraph_no, sentence_no), types))

    #calculate document frequency: the number of sentences each word occurs in
    df = dict()
    for sentence_loc, types in sentences:
        for word in words:
            if word in types:
                df[word] = df.get(word, 0) + 1

    #nothing in the document shares a word with the sentence, so there is no match
    if not df:
        return None

    #work out which sentence has the maximum score
    max_score = 0
    max_sentence_loc = None
    for sentence_loc, types in sentences:
        current_score = 0.0
        for word in words:
            if word in types:
                current_score += 1/float(df[word])
        #strict comparison, so the first sentence reaching the best score wins
        if current_score > max_score:
            max_score = current_score
            max_sentence_loc = sentence_loc

    #we need to calculate the maximum possible score, and only succeed if we have a certain fraction of that...
    max_possible_score = sum([1/float(count) for count in df.values()])

    if max_score < min_fraction * max_possible_score:
        return None

    return max_sentence_loc
 
#create the stop list
def create_stop_list():
    """Collect closed-class words from the tagged Brown corpus.

    Walks every item of the corpus and gathers the lower-cased base form of each
    word whose tag matches the pattern below, without duplicates, printing
    progress as it goes.

    NOTE(review): within the lines visible here the words are collected into a
    local list that shadows the module-level stop_list, and that local list is
    neither returned nor stored anywhere - so the module-level stop_list is not
    changed by calling this. Confirm whether the definition continues beyond this
    point before relying on it.
    """
 
    print 'creating stop list...'
    #local list; this assignment does not touch the module-level stop_list
    stop_list = []
    #we use the entire Brown corpus
    for entry in brown.items():
        print 'Processing entry:', entry
        for word in brown.tokenize(entry):
 
            #get the word type
            base = word.type().base().lower()
            #and the associated tag
            tag = word.type().tag()
            #if it is on our list of tags, and not already in the stop list
            #(match() only anchors at the start of the tag, so an alternative such as
            #'IN' or 'DT' also accepts any longer tag that begins with it)
            if match(r'DT|CC|TO|WDT|WP.*|WRB.*|AT|EX.*|IN|MD.*', tag) and not base in stop_list:
                print 'found word:', base
                #then put it in
                stop_list.append(base)