Did I find the right examples for you? yes no      Crawl my project      Python Jobs

All Samples(7)  |  Call(6)  |  Derive(0)  |  Import(1)
It calculates some stats for the given seqs.

        def calculate_sequence_stats(seqs, kmer_size=None, do_dust_stats=False,
                             nxs=None):
    'It calculates some stats for the given seqs.'
    # get data
    lengths = IntCounter()
    quals_per_pos = IntBoxplot()
    nucl_freq = NuclFreqsPlot()
    kmer_counter = KmerCounter(kmer_size) if kmer_size else None
    dustscores = IntCounter()
    for seq in seqs:
        lengths[get_length(seq)] += 1
        try:
            quals = get_int_qualities(seq)
        except AttributeError:
            quals = []
        for index, qual in enumerate(quals):
            quals_per_pos.append(index + 1, qual)
        str_seq = get_str_seq(seq)
        for index, nucl in enumerate(str_seq):
            nucl_freq.append(index, nucl)
        if kmer_counter is not None:
            kmer_counter.count_seq(str_seq)
        if do_dust_stats:
            dustscore = calculate_dust_score(seq)
            if dustscore is not None:
                dustscores[int(dustscore)] += 1

    lengths.update_labels({'sum': 'tot. residues', 'items': 'num. seqs.'})

    # length distribution
    lengths_srt = 'Length stats and distribution.\n'
    lengths_srt += '------------------------------\n'
    nxs = sorted(nxs) if nxs else []
    for nx in sorted(nxs):
        lengths_srt += 'N{:d}: {:d}\n'.format(nx, calculate_nx(lengths, nx))
    lengths_srt += str(lengths)
    lengths_srt += '\n'

    # agregate quals
    if quals_per_pos:
        quals = quals_per_pos.aggregated_array
        quals.update_labels({'sum': None, 'items': 'tot. base pairs'})

        q30 = quals.count_relative_to_value(30, operator.ge) / quals.count
        q30 *= 100

        q20 = quals.count_relative_to_value(20, operator.ge) / quals.count
        q20 *= 100

        # qual distribution
        qual_str = 'Quality stats and distribution.\n'
        qual_str += '-------------------------------\n'
        qual_str += 'Q20: {:.2f}\n'.format(q20)
        qual_str += 'Q30: {:.2f}\n'.format(q30)
        qual_str += str(quals)
        qual_str += '\n'

        # qual per position boxplot
        qual_boxplot = 'Boxplot for quality per position.\n'
        qual_boxplot += '---------------------------------\n'
        qual_boxplot += quals_per_pos.ascii_plot
        qual_boxplot += '\n'
    else:
        qual_str = ''
        qual_boxplot = ''

    # nucl freqs
    freq_str = 'Nucleotide frequency per position.\n'
    freq_str += '----------------------------------\n'
    freq_str += nucl_freq.ascii_plot
    freq_str += '\n'

    # kmer_distriubution
    kmer_str = ''
    if kmer_counter is not None:
        kmers = IntCounter(kmer_counter.values)
        if kmers:
            kmers.update_labels({'sum': None, 'items': 'num. kmers'})
            kmer_str = 'Kmer distribution\n'
            kmer_str += '-----------------\n'
            kmer_str += str(kmers)
            kmer_str += '\n'
            kmer_str += 'Most common kmers:\n'
            for kmer, number in kmer_counter.most_common(20):
                kmer_str += '\t{}: {}\n'.format(kmer, number)

    dust_str = ''
    if dustscores:
        dustscores.update_labels({'sum': None, 'items': 'num. seqs.'})
        dust_str = 'Dustscores stats and distribution.\n'
        dust_str += '----------------------------------\n'
        dust7 = (dustscores.count_relative_to_value(7, operator.gt) /
                 dustscores.count)
        dust_str += '% above 7 (low complexity): {:.2f}\n'.format(dust7)
        dust_str += str(dustscores)
        dust_str += '\n'

    return {'length': lengths_srt,
            'quality': qual_str,
            'nucl_freq': freq_str,
            'qual_boxplot': qual_boxplot,
            'kmer': kmer_str,
            'dustscore': dust_str}
        


src/s/e/seq_crumbs-HEAD/test/test_statistics.py   seq_crumbs(Download)
from Bio.Seq import Seq
 
from crumbs.statistics import (IntCounter, draw_histogram, IntBoxplot,
                               calculate_sequence_stats, NuclFreqsPlot,
                               KmerCounter, calculate_dust_score,
            in_fhands.append(fhand)
        seqs = read_seqs(in_fhands, prefered_seq_classes=[SEQRECORD])
        results = calculate_sequence_stats(seqs, nxs=[50])
        assert 'maximum: 4' in results['length']
        assert 'N50' in results['length']
        infhands = [open(join(TEST_DATA_DIR, 'arabidopsis_genes'))]
        seqs = list(read_seqs(infhands, prefered_seq_classes=[SEQRECORD]))
        kmers = calculate_sequence_stats(seqs)['kmer']
        assert not 'Kmer distribution' in kmers
 
        kmers = calculate_sequence_stats(seqs, kmer_size=3)['kmer']
 
        # dust
        dust = calculate_sequence_stats(seqs)['dustscore']
        assert not dust
        dust = calculate_sequence_stats(seqs, do_dust_stats=True)['dustscore']