Source code for bripipetools.qc.sexcheck

"""
Class and methods to perform routine sex check on all processed libraries.
"""
import logging
import os
import csv

import pandas as pd

from .. import parsing
from . import SexPredictor, SexVerifier

logger = logging.getLogger(__name__)


[docs]class SexChecker(object): """ Reads gene counts for a processed library, maps genes to X and Y chromosomes, computes ratio of Y to X counts, gives predicted sex based on pre-defined rule. """ def __init__(self, processedlibrary, reference, workflowbatch_id, pipeline_root, db, run_opts): logger.debug("creating an instance of `SexChecker` for processed " "library '{}', workflow batch ID '{}', with " "pipeline root '{}'" .format(processedlibrary._id, workflowbatch_id, pipeline_root)) self.processedlibrary = processedlibrary self.reference = reference self.workflowbatch_id = workflowbatch_id self.pipeline_root = pipeline_root self.db = db self.run_opts = run_opts
[docs] def _load_x_genes(self, ref='grch38'): """ Read X chromosome gene IDs from file and return data frame. """ data_path = os.path.join( os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'data' ) x_genes_file = os.path.join(data_path, '{}_gene_ids_x.csv'.format(ref)) return pd.read_table(x_genes_file, names=['geneName'], skiprows=1)
[docs] def _load_y_genes(self, ref='grch38'): """ Read Y chromosome gene IDs from file and return data frame. """ data_path = os.path.join( os.path.dirname(os.path.dirname(os.path.realpath(__file__))), 'data' ) y_genes_file = os.path.join(data_path, '{}_gene_ids_y.csv'.format(ref)) return pd.read_table(y_genes_file, names=['geneName'], skiprows=1)
[docs] def _get_counts_path(self): """ Construct the absolute path to the counts file for the specified workflow batch. """ return [os.path.join(self.pipeline_root, d['outputs']['counts'][0]['file'].lstrip('/')) for d in self.processedlibrary.processed_data if d['workflowbatch_id'] == self.workflowbatch_id][0]
[docs] def _get_x_y_counts(self): """ Extract and store counts for X and Y genes; also store count total. """ counts_df = pd.read_table(self._get_counts_path(), names=['geneName', 'count']) logger.debug("counts data frame has {} rows".format(len(counts_df))) self.total_counts = sum(counts_df[counts_df['count'] > 0]['count']) y_counts = pd.merge(self._load_y_genes(ref=self.reference), counts_df, how='inner') self.y_counts = y_counts[y_counts['count'] > 0] logger.debug("detected {} Y gene(s)".format(len(self.y_counts))) x_counts = pd.merge(self._load_x_genes(ref=self.reference), counts_df, how='inner') self.x_counts = x_counts[x_counts['count'] > 0] logger.debug("detected {} X gene(s)".format(len(self.x_counts)))
[docs] def _compute_x_y_data(self): """ Collect and store X and Y gene/count data as well as total counts for the current processed library. """ self._get_x_y_counts() self.data = { 'x_genes': len(self.x_counts), 'y_genes': len(self.y_counts), 'x_counts': sum(self.x_counts['count']), 'y_counts': sum(self.y_counts['count']), 'total_counts': self.total_counts, }
[docs] def _predict_sex(self): """ Return predicted sex based on X/Y gene equation and cutoff. """ logger.debug("adding sex check QC info for '{}'" .format(self.processedlibrary._id)) self.data = SexPredictor(self.data, self.run_opts).predict()
[docs] def _verify_sex(self): """ Compare predicted sex to reported sex. """ logger.debug("verifying sex prediction") self.data = SexVerifier( data=self.data, processedlibrary=self.processedlibrary, db=self.db ).verify()
[docs] def _write_data(self): """ Save the sex validation data to a new file. """ counts_path = self._get_counts_path() project_path = os.path.dirname(os.path.dirname(counts_path)) output_filename = '{}_{}_sexcheck_validation.csv'.format( parsing.get_library_id(counts_path), parsing.get_flowcell_id(counts_path) ) output_dir = os.path.join(project_path, 'validation') output_path = os.path.join(output_dir, output_filename) logger.debug("saving sex check file '{}' to '{}'" .format(output_filename, output_dir)) if not os.path.isdir(output_dir): logger.debug("creating directory '{}'".format(output_dir)) os.makedirs(output_dir) with open(output_path, 'w') as f: writer = csv.DictWriter(f, fieldnames=list(self.data.keys())) writer.writeheader() writer.writerow(self.data) return output_path
[docs] def update(self): """ Add predicted sex validation field to processed library outputs and return processed library object. """ if self.reference != 'grch38': return self.processedlibrary processed_data = [d for d in self.processedlibrary.processed_data if d['workflowbatch_id'] == self.workflowbatch_id][0] logger.debug("predicting sex based on X and Y gene data") self._compute_x_y_data() self._predict_sex() self._verify_sex() self._write_data() processed_data.setdefault('validation', {})['sex_check'] = self.data return self.processedlibrary