
Samples: 7 (Call: 5, Derive: 0, Import: 2)

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/_serifxml.py
        # Build a StreamItem tagged with the v0_3_0 thrift schema version.
        si = streamcorpus.StreamItem(
            version=streamcorpus.Versions.v0_3_0,
            doc_id=doc_id,
            abs_url=fname,
        )
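A self-contained sketch of the same construction; doc_id and fname come from the elided surrounding code, so the values below are placeholders, not from the source:

import streamcorpus

fname = '/tmp/example.serifxml'                 # placeholder input path
doc_id = '0123456789abcdef0123456789abcdef'     # placeholder 32-hex doc id
si = streamcorpus.StreamItem(
    version=streamcorpus.Versions.v0_3_0,
    doc_id=doc_id,
    abs_url=fname,
)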

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/tests/test_clean_html.py
import os

import pytest

from streamcorpus import StreamItem, ContentItem
import streamcorpus_pipeline
from streamcorpus_pipeline._clean_html import make_clean_html, clean_html

    # Build the clean_html stage from its config dict (the call was truncated
    # in the original excerpt; clean_html is the stage imported above).
    stage = clean_html({
        'all_domains': True,
    })
    si = StreamItem(body=ContentItem(clean_html=html))
    context = {}
    stage(si, context)

    with open(os.path.join(path, 'nytimes-index.html'), 'r') as f:
        raw = f.read().decode('utf8')
    si = StreamItem(body=ContentItem(raw=raw, media_type='text/html'))
    si = stage(si, {})
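make_clean_html, imported above but unused in the excerpt, scrubs raw HTML directly; a minimal sketch, assuming it takes the raw HTML string as its first argument and returns the cleaned markup (the toy input is not from the tests):

from streamcorpus import StreamItem, ContentItem
from streamcorpus_pipeline._clean_html import make_clean_html

raw = '<html><body><p>Hello, world.</p></body></html>'   # toy input
si = StreamItem(body=ContentItem(raw=raw, media_type='text/html'))
si.body.clean_html = make_clean_html(raw)                 # assumed call shape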
 

src/s/t/streamcorpus_pipeline-0.5.23.dev1/streamcorpus_pipeline/tests/test_hyperlink_labels.py
import os

import pytest

from streamcorpus import make_stream_item, StreamItem, ContentItem, OffsetType, Chunk
import streamcorpus_pipeline
from streamcorpus_pipeline._clean_visible import clean_visible

def test_speed(parser_type, test_data_dir):
    stream_items = []
    for i in xrange(10):
        stream_item = StreamItem()
        stream_item.body = ContentItem()

def test_long_doc(parser_type, test_data_dir):
    stream_item = StreamItem()
    stream_item.body = ContentItem()
    path = os.path.join(test_data_dir, 'test')
    stream_item.body.clean_html = open(path).read()
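Chunk, imported in this test module, is streamcorpus's on-disk container for serialized StreamItems; a sketch of writing an item out and reading it back, assuming a writable placeholder path (the timestamp and URL passed to make_stream_item are illustrative):

from streamcorpus import make_stream_item, Chunk

si = make_stream_item('2000-01-01T12:34:00.0Z', 'http://example.com/')

ch = Chunk('/tmp/items.sc', mode='wb')   # placeholder output path
ch.add(si)
ch.close()

for item in Chunk('/tmp/items.sc'):      # chunks are iterable when read back
    print item.abs_url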