Skip to content

Instantly share code, notes, and snippets.

@scienceystuff
Created December 17, 2018 18:55
Show Gist options
  • Select an option

  • Save scienceystuff/d5783a5367f9edc9c5e33742f3acfec4 to your computer and use it in GitHub Desktop.

Select an option

Save scienceystuff/d5783a5367f9edc9c5e33742f3acfec4 to your computer and use it in GitHub Desktop.
Samtools Tags in a python script, this will be updated as needed
# standard tags for samtools
# format is {name : (type, description)}
# where type is the long name of one of: character (A), general-array (B), real-number (f), hexadecimal-array (H), integer (i), string (Z), reserved (X)
# example usage:
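# samtools_tags['BC']
# # outputs ('string', 'Barcode sequence identifying the sample')
# samtools_tags['BC'][1]
# # outputs 'Barcode sequence identifying the sample'
# samtools_tags_examples['BC']
# # outputs ('string', 'ATCACG')
# samtools_tags_examples['BC'][1]
# # outputs 'ATCACG'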
# Maps the one-letter SAM optional-field type code to a human-readable name.
# 'X' is not a SAM type code; it is used in this file to mark reserved tags.
sm_type = {
    'i': 'integer',
    'A': 'character',
    'B': 'general-array',
    'f': 'real-number',
    # 'H' was missing although the SAM spec defines it as a valid type code.
    'H': 'hexadecimal-array',
    'Z': 'string',
    'X': 'reserved',
}
# Maps each two-character SAM optional-field tag to a (type, description) tuple.
# The type is the long name looked up in sm_type; the description is free text.
# Keys ending in '?' ('X?', 'Y?', 'Z?') are literal dictionary keys standing for
# the whole family of user-reserved tags, not patterns that are matched.
samtools_tags = {
    'AM': (sm_type['i'], 'The smallest template-independent mapping quality in the template'),
    'AS': (sm_type['i'], 'Alignment score generated by aligner'),
    'BC': (sm_type['Z'], 'Barcode sequence identifying the sample'),
    'BQ': (sm_type['Z'], 'Offset to Base Alignment Quality BAQ'),
    'BZ': (sm_type['Z'], 'Phred quality of the unique molecular barcode bases in the OX tag'),
    'CB': (sm_type['Z'], 'Cell identifier'),
    'CC': (sm_type['Z'], 'Reference name of the next hit'),
    'CG': (sm_type['B'], 'CIGAR in BAM binary encoding iff it consists of > 65535 operators, BAM only'),
    'CM': (sm_type['i'], 'Edit distance between the color sequence and the color reference, see also NM'),
    'CO': (sm_type['Z'], 'Free-text comments'),
    'CP': (sm_type['i'], 'Leftmost coordinates of the next hit'),
    'CQ': (sm_type['Z'], 'Color read base qualities'),
    'CR': (sm_type['Z'], 'Uncorrected cellular barcode sequences'),
    'CS': (sm_type['Z'], 'Color read sequence'),
    'CT': (sm_type['Z'], 'Complete read annotation tag, used for consensus annotation dummy features'),
    'CY': (sm_type['Z'], 'Phred quality of the cellular barcode sequence in the CR tag'),
    'E2': (sm_type['Z'], 'The 2nd most likely base calls'),
    # Fixed typo: "segmant" -> "segment".
    'FI': (sm_type['i'], 'The index of segment in the template'),
    'FS': (sm_type['Z'], 'Segment suffix'),
    'FZ': (sm_type['B'], 'Flow signal intensity'),
    'GC': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'GQ': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'GS': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'H0': (sm_type['i'], 'Number of perfect hits'),
    'H1': (sm_type['i'], 'Number of 1-difference hits, see also NM'),
    'H2': (sm_type['i'], 'Number of 2-difference hits'),
    'HI': (sm_type['i'], 'Query hit index'),
    'IH': (sm_type['i'], 'Query hit total count'),
    'LB': (sm_type['Z'], 'Library'),
    'MC': (sm_type['Z'], 'CIGAR string for mate or next segment'),
    'MD': (sm_type['Z'], 'String for mismatching positions'),
    'MF': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'MI': (sm_type['Z'], 'Molecular identifier and a string that uniquely identifies the molecule from which the record was derived'),
    'MQ': (sm_type['i'], 'Mapping quality of the mate or next segment'),
    'NH': (sm_type['i'], 'Number of reported alignments that contain the query in the current record'),
    'NM': (sm_type['i'], 'Edit distance to the reference'),
    'OC': (sm_type['Z'], 'Original CIGAR'),
    'OP': (sm_type['i'], 'Original mapping position'),
    'OQ': (sm_type['Z'], 'Original base quality'),
    'OX': (sm_type['Z'], 'Original unique molecular barcode bases'),
    'PG': (sm_type['Z'], 'Program'),
    'PQ': (sm_type['i'], 'Phred likelihood of the template'),
    'PT': (sm_type['Z'], 'Read annotations for parts of the padded read sequence'),
    'PU': (sm_type['Z'], 'Platform unit'),
    'Q2': (sm_type['Z'], 'Phred quality of the mate or next segment sequence in the R2 tag'),
    'QT': (sm_type['Z'], 'Phred quality of the sample barcode sequence in the BC tag'),
    'QX': (sm_type['Z'], 'Quality score of the unique molecular identifier in the RX tag'),
    'R2': (sm_type['Z'], 'Sequence of the mate or next segment in the template'),
    'RG': (sm_type['Z'], 'Read group'),
    'RT': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'RX': (sm_type['Z'], 'Sequence bases of the unique molecular identifier that may be corrected'),
    'S2': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'SA': (sm_type['Z'], 'Other canonical alignments in a chimeric alignment'),
    'SM': (sm_type['i'], 'Template-independent mapping quality'),
    'SQ': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'TC': (sm_type['i'], 'The number of segments in the template'),
    # Fixed per the SAM optional-fields spec: conditional on the *best* call
    # being wrong (the original text said "last").
    'U2': (sm_type['Z'], 'Phred probability of the 2nd call being wrong conditional on the best being wrong'),
    'UQ': (sm_type['i'], 'Phred likelihood of the segment that is conditional on the mapping being correct'),
    'X?': (sm_type['X'], 'Reserved for end users, second value may be [0-9,Aa-Zz]'),
    'Y?': (sm_type['X'], 'Reserved for end users, second value may be [0-9,Aa-Zz]'),
    'Z?': (sm_type['X'], 'Reserved for end users, second value may be [0-9,Aa-Zz]'),
}
# Maps each two-character SAM optional-field tag to a (type, example) tuple.
# The type is the long name looked up in sm_type. The example is a sample value
# as a string; 'NA' appears to be the placeholder for "no example provided"
# (TODO confirm). Reserved tags repeat their reservation note in place of an
# example.
samtools_tags_examples = {
    'AM': (sm_type['i'], 'NA'),
    'AS': (sm_type['i'], 'NA'),
    'BC': (sm_type['Z'], 'ATCACG'),
    'BQ': (sm_type['Z'], 'NA'),
    'BZ': (sm_type['Z'], 'NA'),
    'CB': (sm_type['Z'], 'NA'),
    'CC': (sm_type['Z'], 'NA'),
    'CG': (sm_type['B'], 'NA'),
    'CM': (sm_type['i'], 'NA'),
    'CO': (sm_type['Z'], 'The cake is a lie'),
    'CP': (sm_type['i'], 'NA'),
    'CQ': (sm_type['Z'], 'NA'),
    'CR': (sm_type['Z'], 'NA'),
    'CS': (sm_type['Z'], 'NA'),
    'CT': (sm_type['Z'], 'NA'),
    'CY': (sm_type['Z'], 'NA'),
    'E2': (sm_type['Z'], 'NA'),
    'FI': (sm_type['i'], 'NA'),
    'FS': (sm_type['Z'], 'NA'),
    'FZ': (sm_type['B'], 'NA'),
    'GC': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'GQ': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'GS': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'H0': (sm_type['i'], 'NA'),
    'H1': (sm_type['i'], 'NA'),
    'H2': (sm_type['i'], 'NA'),
    'HI': (sm_type['i'], 'NA'),
    'IH': (sm_type['i'], 'NA'),
    'LB': (sm_type['Z'], 'NA'),
    'MC': (sm_type['Z'], 'NA'),
    'MD': (sm_type['Z'], 'NA'),
    'MF': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'MI': (sm_type['Z'], 'NA'),
    'MQ': (sm_type['i'], 'NA'),
    'NH': (sm_type['i'], 'NA'),
    'NM': (sm_type['i'], 'NA'),
    'OC': (sm_type['Z'], 'NA'),
    'OP': (sm_type['i'], 'NA'),
    'OQ': (sm_type['Z'], 'NA'),
    'OX': (sm_type['Z'], 'NA'),
    'PG': (sm_type['Z'], 'Illumina'),
    'PQ': (sm_type['i'], 'NA'),
    'PT': (sm_type['Z'], 'NA'),
    'PU': (sm_type['Z'], 'Nextera'),
    'Q2': (sm_type['Z'], 'NA'),
    'QT': (sm_type['Z'], 'NA'),
    'QX': (sm_type['Z'], 'NA'),
    'R2': (sm_type['Z'], 'NA'),
    'RG': (sm_type['Z'], 'RG1'),
    'RT': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'RX': (sm_type['Z'], 'NA'),
    'S2': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'SA': (sm_type['Z'], 'NA'),
    'SM': (sm_type['i'], 'NA'),
    'SQ': (sm_type['X'], 'Reserved for backwards compatibility reasons'),
    'TC': (sm_type['i'], 'NA'),
    'U2': (sm_type['Z'], 'NA'),
    # Was '' in the original; normalized to 'NA' so every tag without an
    # example uses the same placeholder.
    'UQ': (sm_type['i'], 'NA'),
    'X?': (sm_type['X'], 'Xi'),
    'Y?': (sm_type['X'], 'Yi'),
    'Z?': (sm_type['X'], 'Zi'),
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment