Did I find the right examples for you? yes no      Crawl my project      Python Jobs

All Samples(31)  |  Call(24)  |  Derive(0)  |  Import(7)

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_upgrade_streamcorpus.py   streamcorpus_pipeline(Download)
import logging
 
from streamcorpus import make_stream_item, make_stream_time, ContentItem, Tagging, Rating, Target, Annotator
from streamcorpus_pipeline.stages import Configured
 
 
        s2.body = ContentItem(
            raw = s1.body.raw,
            encoding = s1.body.encoding,
            ## default, might get overwritten below
        if s1.title:
            ci = ContentItem(
                raw = s1.title.raw,
                encoding = s1.title.encoding,
                clean_visible = s1.title.cleansed,
                )
            s2.other_content['title'] = ci
        if s1.anchor:
            ci = ContentItem(
                raw = s1.anchor.raw,

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_convert_kba_json.py   streamcorpus_pipeline(Download)
import json
 
from streamcorpus import make_stream_item, ContentItem
from streamcorpus_pipeline.stages import Configured
 
 
        stream_item.body = ContentItem(
            raw = b''.join(['<p>', anchor, '</p>',
                            '<p>', title, '</p>',
                            body]),
        if title:
            stream_item.other_content['title']  = ContentItem(
                raw = title,
                media_type = 'text/html',
                encoding = 'UTF-8',
                )
 
        if anchor:
            stream_item.other_content['anchor']  = ContentItem(
                raw = anchor,

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_spinn3r_feed_storage.py   streamcorpus_pipeline(Download)
        si.other_content['extract'] = _make_content_item(pe.content_extract)
    si.other_content['title'] = streamcorpus.ContentItem(
        raw=pe.title.encode('utf8'),
        media_type=pe.content_extract.mime_type,
        encoding='UTF-8')
    si.other_content['feed_entry_title'] = streamcorpus.ContentItem(
        raw=entry.feed_entry.title.encode('utf8'),
        mime_type = node.mime_type
    raw = raw.decode('utf8').encode('utf8')
    return streamcorpus.ContentItem(raw=raw, media_type=mime_type)
 

src/s/t/streamcorpus_pipeline-0.5.23.dev1/examples/john_smith_custom_reader.py   streamcorpus_pipeline(Download)
 
        ## build a ContentItem for the body
        body = streamcorpus.ContentItem()
        raw_string = open(os.path.join(dir_path, fname)).read()
        ## We know that this is already clean and has nothing

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_serifxml.py   streamcorpus_pipeline(Download)
        )
        body = streamcorpus.ContentItem(
            raw=raw,
            taggings={
                self.config['tagger_id']: tagging,

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_title.py   streamcorpus_pipeline(Download)
import re
 
from streamcorpus import ContentItem
from streamcorpus_pipeline.stages import Configured
from streamcorpus_pipeline import _exceptions
def add_content_item(stream_item, title_m):
    """Attach a normalized, length-capped title to ``stream_item``.

    The ``title`` group of the match object ``title_m`` has every match
    of ``whitespace_re`` replaced by a single space and is then stripped.
    A result longer than 60 characters is cut to its first 60 characters
    followed by ``'...'``.  The final text is stored as the
    ``clean_visible`` of a new ``ContentItem`` under
    ``stream_item.other_content['title']``.
    """
    collapsed = whitespace_re.sub(' ', title_m.group('title'))
    text = collapsed.strip()
    # Keep short titles untouched; mark truncated ones with an ellipsis.
    text = text if len(text) <= 60 else text[:60] + '...'
    stream_item.other_content['title'] = ContentItem(clean_visible=text)

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_upgrade_streamcorpus_v0_3_0.py   streamcorpus_pipeline(Download)
            setattr(si3, attr, copy.deepcopy(getattr(si, attr)))
 
        si3.body = streamcorpus.ContentItem()
 
        for name, ci in si.other_content.items():
            ci3 = streamcorpus.ContentItem()

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_john_smith.py   streamcorpus_pipeline(Download)
 
            ## build a ContentItem for the body
            body = streamcorpus.ContentItem()
            raw_string = open(os.path.join(dir_path, fname)).read()
            ## We know that this is already clean and has nothing

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_yaml_files_list.py   streamcorpus_pipeline(Download)
 
        ## build a ContentItem for the body
        body = streamcorpus.ContentItem()
        body.media_type = magic.from_file(path, mime=True)
 

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/tests/test_clean_html.py   streamcorpus_pipeline(Download)
import pytest
 
from streamcorpus import StreamItem, ContentItem
import streamcorpus_pipeline
from streamcorpus_pipeline._clean_html import make_clean_html, clean_html
        'all_domains': True,
    })
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    stage( si, context )
    with open(os.path.join(path, 'nytimes-index.html'), 'r') as f:
        raw = f.read().decode('utf8')
    si = StreamItem(body=ContentItem(raw=raw, media_type='text/html'))
    si = stage(si, {})
 

  1 | 2  Next