
62 samples: 47 calls, 1 derive, 14 imports.
streamcorpus.Chunk: reader/writer for batches of Thrift messages stored in flat files.
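For orientation, a minimal sketch of the write-then-read pattern the samples below rely on. make_stream_item, Chunk.add, and Chunk.close come from the streamcorpus API; the temporary path and URL are placeholders.

import os
import tempfile
import streamcorpus

## a fresh path for the example chunk file
t_path = os.path.join(tempfile.mkdtemp(), 'example.sc')

## write: a chunk is a flat file of Thrift-serialized StreamItems
si = streamcorpus.make_stream_item('2000-01-01T12:34:00.000000Z',
                                   'http://example.com/doc-1')
o_chunk = streamcorpus.Chunk(path=t_path, mode='wb')
o_chunk.add(si)
o_chunk.close()

## read it back; iterating a Chunk yields the stored StreamItems
for si in streamcorpus.Chunk(path=t_path, mode='rb'):
    print(si.abs_url)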

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_local_storage.py
                message = _message_versions[self.config['streamcorpus_version']]
                logger.debug('reading from %r' % i_str)
                chunk = streamcorpus.Chunk(path=i_str, mode='rb', message=message)
                return chunk
            except IOError, exc:
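A hedged sketch of the version lookup shown above, pulled out of the class for illustration; the _message_versions mapping here is a stand-in built for the example, not the pipeline's own table.

import streamcorpus

## illustrative mapping from a config string to a Thrift message class;
## the pipeline keeps its own _message_versions table for this
_message_versions = {'v0_3_0': streamcorpus.StreamItem_v0_3_0}

def open_chunk_for_reading(i_str, version='v0_3_0'):
    ## pick the message class that matches the on-disk serialization,
    ## then open the chunk read-only
    message = _message_versions[version]
    return streamcorpus.Chunk(path=i_str, mode='rb', message=message)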

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_taggers.py
 
import streamcorpus
from streamcorpus import Chunk, Tagging, Label, OffsetType, add_annotation
from streamcorpus_pipeline._clean_visible import make_clean_visible_file, \
    cleanse
def _aligner_core(t_path1, aligner, aligner_data):
    t_chunk1 = Chunk(t_path1, mode='rb')
    t_path2 = t_path1 + '-tmp-aligning'
    t_chunk2 = Chunk(t_path2, mode='wb')
    for si in t_chunk1:
 
        ## process the chunk's clean_visible data into xml
        i_chunk = Chunk(path=chunk_path, mode='rb')
        make_clean_visible_file(i_chunk, clean_visible_path)
 
        ## make a new output chunk at a temporary path
        tmp_chunk_path     = chunk_path + '_'
        o_chunk = Chunk(path=tmp_chunk_path, mode='wb')
 
        ## re-open i_chunk
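The same open-read, write-to-a-temporary-path shape as _aligner_core, condensed into a standalone sketch; transform is a hypothetical per-item function, and Chunk.add/close are assumed from the streamcorpus API.

from streamcorpus import Chunk

def align_chunk(t_path1, transform):
    ## read the existing chunk and write transformed items to a sibling temp path
    t_chunk1 = Chunk(t_path1, mode='rb')
    t_path2 = t_path1 + '-tmp-aligning'
    t_chunk2 = Chunk(t_path2, mode='wb')
    for si in t_chunk1:
        t_chunk2.add(transform(si))
    t_chunk2.close()
    return t_path2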

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_lingpipe.py
from nltk.tokenize import WhitespaceTokenizer
 
from streamcorpus import Token, Sentence, EntityType, Chunk, Offset, \
    OffsetType, Gender, MentionType, Attribute, AttributeType
from streamcorpus_pipeline.stages import Configured
 
    elif args.action == 'align':
        i_chunk = Chunk(path=args.source_chunk, mode='rb')
        o_chunk = Chunk(path=args.output_file,  mode='wb')
        align_chunk_with_ner(args.input_file, i_chunk, o_chunk)  # pylint: disable=E0602

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_truncate.py
 
import os
from streamcorpus import Chunk
 
class truncate(object):
        ## make a new output chunk at a temporary path
        tmp_chunk_path = chunk_path + '_'
        t_chunk = Chunk(path=tmp_chunk_path, mode='wb')
 
        for num, si in enumerate(Chunk(path=chunk_path)):
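The excerpt stops inside the loop; a hedged guess at how such a stage typically finishes is to close the temporary chunk and move it over the original path. The count-based cutoff and the os.rename step are illustrative only, not taken from _truncate.py (the real stage trims content inside each item rather than dropping items).

import os
from streamcorpus import Chunk

def keep_first_n(chunk_path, n):
    ## copy at most the first n stream items to a temporary sibling path,
    ## then move the finished file back over the original
    tmp_chunk_path = chunk_path + '_'
    t_chunk = Chunk(path=tmp_chunk_path, mode='wb')
    for num, si in enumerate(Chunk(path=chunk_path)):
        if num >= n:
            break
        t_chunk.add(si)
    t_chunk.close()
    os.rename(tmp_chunk_path, chunk_path)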

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_s3_storage.py
 
import streamcorpus
from streamcorpus import decrypt_and_uncompress, compress_and_encrypt_path, Chunk
from streamcorpus_pipeline._exceptions import FailedExtraction
from streamcorpus_pipeline._get_name_info import get_name_info
                message = _message_versions[ self.config['streamcorpus_version'] ]
 
                return streamcorpus.Chunk(data=data, message=message)
 
            else:
            )
 
        logger.info( 'got back SIs: %d' % len( list( Chunk(data=data) ) ))
 
        rec_md5 = hashlib.md5(data).hexdigest() # pylint: disable=E1101
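A sketch of the in-memory pattern used here: verify the payload's md5, then build a Chunk directly from the byte string. The function name is hypothetical and ValueError stands in for the pipeline's own FailedExtraction; data would be whatever bytes came back from storage.

import hashlib
import streamcorpus

def parse_verified_chunk(data, expected_md5):
    ## check the raw payload before trusting it
    rec_md5 = hashlib.md5(data).hexdigest()
    if rec_md5 != expected_md5:
        raise ValueError('md5 mismatch: %s != %s' % (rec_md5, expected_md5))
    ## Chunk(data=...) iterates messages straight out of an in-memory byte string
    return list(streamcorpus.Chunk(data=data))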

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_pipeline.py
                # TODO: make this EVEN LAZIER by not opening the t_chunk until inside _run_incremental_transforms when the first output si is ready
                t_path = os.path.join(self.tmp_dir_path, 'trec-kba-pipeline-tmp-%s' % str(uuid.uuid4()))
                self.t_chunk = streamcorpus.Chunk(path=t_path, mode='wb')
            assert self.t_chunk.message == streamcorpus.StreamItem_v0_3_0, self.t_chunk.message
 
            t_path2 = os.path.join(self.tmp_dir_path, 'trec-kba-pipeline-tmp-%s' % str(uuid.uuid1()))
            # open destination for _run_incremental_transforms to write to
            self.t_chunk = streamcorpus.Chunk(path=t_path2, mode='wb')
 
            input_t_chunk = streamcorpus.Chunk(path=t_path, mode='rb')

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_get_name_info.py
import random
 
from streamcorpus import Chunk
 
def get_name_info(chunk_path, assert_one_date_hour=False, i_str=None):
    # calculation lazily, the name format might not even need that
    # value.
    ch = Chunk(path=chunk_path, mode='rb')
    date_hours = set()
    target_names = set()
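A condensed sketch of that scan: walk the chunk once and collect the hour buckets the naming code needs. The zulu_timestamp slicing is an illustrative shortcut, not the pipeline's exact date_hour format.

from streamcorpus import Chunk

def collect_date_hours(chunk_path):
    ## one pass over the chunk, gathering the hour bucket of every stream item
    date_hours = set()
    for si in Chunk(path=chunk_path, mode='rb'):
        ## '2000-01-01T12:34:00.000000Z'[:13] -> '2000-01-01T12'
        date_hours.add(si.stream_time.zulu_timestamp[:13])
    return date_hours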

src/s/t/streamcorpus_factorie-0.1.0/src/streamcorpus_factorie/_factorie.py
import time
 
from streamcorpus import Chunk
from streamcorpus_pipeline.config import configure_logger
from streamcorpus_pipeline.stages import BatchTransform
        self.call_factorie(self.toFactoriePipeName, self.fromFactoriePipeName)
 
        self.pipeToFactorie = Chunk(path=self.toFactoriePipeName, mode='ab')
        self.pipeFromFactorie = Chunk(path=self.fromFactoriePipeName, mode='rb')
        self.taggedChunkIter = iter(self.pipeFromFactorie)
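The append-mode usage above, isolated: mode='ab' adds messages to an existing chunk file rather than truncating it. A minimal sketch; the path is a placeholder, not the named pipe the factorie stage actually uses.

import streamcorpus

## open a chunk for appending and add one more stream item to it
pipe_out = streamcorpus.Chunk(path='/tmp/to_factorie.sc', mode='ab')
pipe_out.add(streamcorpus.make_stream_item('2000-01-01T12:34:00.000000Z',
                                           'http://example.com/doc-2'))
pipe_out.close()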

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_run_lingpipe.py
 
import streamcorpus
from streamcorpus import Chunk, Label, Tagging
#import lingpipe
lingpipe = None
    Chunk with body.ner
    '''
    o_chunk = Chunk()
    input_iter = i_chunk.__iter__()
    ner = ''
 
        ## just need one chunk for this tiny corpus
        i_chunk = Chunk(file_obj=open(fpath))
 
        ## prepare to make intermediate files in tmp_dir
 
    if args.align_only is not None:
        i_chunk = Chunk(file_obj=open(args.input_dir))
        tmp_ner_path = args.output_dir
        tmp_done_path = args.align_only

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_kvlayer.py
        def keys_and_values():
            for si in streamcorpus.Chunk(t_path):
                key1 = uuid.UUID(int=si.stream_time.epoch_ticks)
                key2 = uuid.UUID(hex=si.doc_id)
                data = streamcorpus.serialize(si)
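A self-contained sketch of that generator: each StreamItem becomes one row keyed by its stream_time and doc_id, with the item's Thrift serialization as the value. The yield shape is an assumption about how the generator continues, and the int() cast is a defensive addition in case epoch_ticks arrives as a float.

import uuid
import streamcorpus

def keys_and_values(t_path):
    ## one row per stream item in the chunk at t_path
    for si in streamcorpus.Chunk(t_path):
        key1 = uuid.UUID(int=int(si.stream_time.epoch_ticks))
        key2 = uuid.UUID(hex=si.doc_id)
        data = streamcorpus.serialize(si)
        yield (key1, key2), data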
