import os import pickle import constants from web_page import WebPage from naive_bayes import NaiveBayes def load_html_files(): """ HTMLファイルがあるディレクトリにいる前提で使う """ pages = [] for i in range(constants.NUM_OF_FETCHED_PAGES): with open('%s_%s.html' % (constants.QUERY, str(i)), 'r') as f: page = WebPage() page.html_body = f.read() page.remove_html_tags() pages.append(page) return pages if __name__ == '__main__': # もういちど別の場所で使うのなら関数にする if not os.path.exists(constants.FETCHED_PAGES_DIR_NAME): os.mkdir(constants.FETCHED_PAGES_DIR_NAME) os.chdir(constants.FETCHED_PAGES_DIR_NAME) pages = load_html_files() pkl_nb_path = os.path.join('..', constants.NB_PKL_FILENAME) # もしすでにNaiveBayesオブジェクトをpickle保存していたらそれを学習させる if os.path.exists(pkl_nb_path): with open(pkl_nb_path, 'rb') as f: nb = pickle.load(f) else: nb = NaiveBayes() for page in pages: nb.train(page.html_body, constants.QUERY) # せっかく学習させたんだから保存しよう with open(pkl_nb_path, 'wb') as f: pickle.dump(nb, f)