Source code for bripipetools.parsing.illumina

import logging
import datetime

from .. import util

logger = logging.getLogger(__name__)


def get_flowcell_id(string):
    """
    Extract an Illumina flowcell ID from an arbitrary string.

    The search looks for a run of uppercase letters and digits that ends in
    ``X`` followed by one of ``X``, ``Y``, ``2``, or ``3``, and that is
    immediately preceded by an underscore plus a single ``A``, ``B``, or
    ``D`` character (the preceding characters are not part of the result).

    :type string: str
    :param string: any string that might contain an Illumina flowcell ID
        (e.g., C6VG0ANXX)

    :rtype: str
    :return: the substring matched as the flowcell ID; when nothing matches,
        whatever ``util.matchdefault`` yields for "no match" (an empty
        string, per the module's original documentation)
    """
    flowcell_pattern = '(?<=(_(A|B|D)))([A-Z]|[0-9])*X(X|Y|2|3)'
    return util.matchdefault(flowcell_pattern, string)
def parse_flowcell_run_id(run_id):
    """
    Parse Illumina flowcell run ID (or folder name) and return individual
    components indicating date, instrument ID, run number, flowcell ID, and
    flowcell position.

    The run ID is split on underscores: the first field is read as a
    ``YYMMDD`` date, the second as the instrument ID, and the third as the
    integer run number. The flowcell ID and position are located by pattern
    matching against the full string. A missing or malformed date,
    instrument ID, or run number is logged as a warning and reported as
    ``None`` instead of raising.

    :type run_id: str
    :param run_id: string adhering to standard Illumina format (e.g.,
        '150615_D00565_0087_AC6VG0ANXX') for a sequencing run

    :rtype: dict
    :return: a dict with fields for 'date' (ISO ``YYYY-MM-DD`` string or
        ``None``), 'instrument_id' (str or ``None``), 'run_number' (int or
        ``None``), 'flowcell_id', and 'flowcell_position'; the last two
        carry the "no match" value of ``util.matchdefault`` when no
        flowcell ID is present in ``run_id``
    """
    id_parts = run_id.split('_')
    logger.debug("collecting the following parts from run ID {}: {}"
                 .format(run_id, id_parts))

    # str.split always yields at least one element, so only a badly
    # formatted date (ValueError from strptime) can fail here.
    try:
        d = datetime.datetime.strptime(id_parts[0], '%y%m%d')
        date = d.date().isoformat()
    except ValueError:
        logger.warning("input string does not appear to contain a valid date")
        date = None

    try:
        instr_id = id_parts[1]
    except IndexError:
        logger.warning("input string does not contain a vaild instrument number")
        instr_id = None

    # IndexError: the run ID has fewer than three fields.
    # ValueError: the third field exists but is not an integer literal.
    try:
        run_num = int(id_parts[2])
    except (IndexError, ValueError):
        logger.warning("input string does not appear to contain a valid "
                       "run number")
        run_num = None

    fc_id = util.matchdefault('(?<=(_(A|B|D)))([A-Z]|[0-9])*X(X|Y|2|3)',
                              run_id)
    if fc_id:
        # The position is the single character directly before the
        # flowcell ID.
        fc_pos = util.matchdefault('.{1}(?=%s)' % fc_id, run_id)
    else:
        # With no flowcell ID the lookahead would be empty and match at the
        # very first character of run_id, yielding a bogus position; carry
        # over the "no match" value instead.
        fc_pos = fc_id

    return {'date': date,
            'instrument_id': instr_id,
            'run_number': run_num,
            'flowcell_id': fc_id,
            'flowcell_position': fc_pos}
def parse_fastq_filename(path):
    """
    Break a standard Illumina FASTQ file path into its identifying parts.

    :type path: str
    :param path: full path to FASTQ file with filename adhering to standard
        Illumina format (e.g., '1D-HC29-C04_S27_L001_R1_001.fastq.gz')

    :rtype: dict
    :return: a dict with fields for 'path' (the input after
        ``util.swap_root(path, 'bioinformatics', '/')``), 'lane_id',
        'read_id', and 'sample_number' (an int; 0 when no ``_S<digits>``
        token is found in the path)
    """
    # NOTE(review): swap_root presumably re-roots the path at the
    # 'bioinformatics' directory -- confirm against util.
    rooted_path = util.swap_root(path, 'bioinformatics', '/')
    parsed = {'path': rooted_path}

    # The *last* lane match is taken on purpose, to accommodate newer
    # BaseSpace directory structures.
    parsed['lane_id'] = util.matchlastdefault('(?<=_|-)L00[1-8]', rooted_path)
    parsed['read_id'] = util.matchdefault('(?<=_)R[1-2]', rooted_path)

    # An empty match means no sample token was found; fall back to 0.
    sample_token = util.matchdefault('(?<=_S)[0-9]+', rooted_path)
    parsed['sample_number'] = 0 if sample_token == "" else int(sample_token)

    logger.debug("Found fastq file {} for lane {} with read {} and sample {}"
                 .format(rooted_path, parsed['lane_id'], parsed['read_id'],
                         parsed['sample_number']))
    return parsed