# Source code for bripipetools.io.picardmetrics

"""
Class for reading and parsing Picard metrics files.
"""
import logging
import re

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


class PicardMetricsFile(object):
    """
    Parser to read tables of metrics generated by one of several Picard tools,
    typically stored in an HTML file, and return as a parsed and formatted
    dictionary.

    The metrics table is expected in one of two layouts:

    * *long*: each metric name sits in one cell with its value in the next
      cell; value cells contain a non-breaking space.
    * *wide*: one cell holds all tab-separated metric names and a following
      row holds the tab-separated values.

    :param path: path of the HTML metrics file to read.
    """
    # Non-breaking space; its presence in any cell marks a long table.
    _NBSP = u'\xa0'
    # A long-table key cell is made of word characters only. ``\w`` already
    # includes ``_``; the previous pattern ``^(\w+_*)+$`` matched the same
    # strings but could backtrack exponentially on non-matching text.
    _KEY_RE = re.compile(r'^\w+$')
    # A wide-table header cell starts with upper-case letters.
    _WIDE_HEADER_RE = re.compile(r'^[A-Z]+')
    # Any character that disqualifies a value from numeric conversion.
    _NON_NUMERIC_RE = re.compile(r'[^\d.]')

    def __init__(self, path):
        self.path = path
        # Working storage: 'raw' (file contents) and 'table' (parsed element).
        self.data = {}

    @classmethod
    def _to_number(cls, value):
        """
        Convert a value made only of digits and dots to ``float``.

        Empty strings, strings containing any other character, and strings
        that ``float`` rejects (e.g. ``'1.2.3'`` or ``'.'``) are returned
        unchanged instead of raising.

        :param value: string value taken from the metrics table.
        :returns: ``float`` when the value is numeric, otherwise ``value``.
        """
        if not value or cls._NON_NUMERIC_RE.search(value):
            return value
        try:
            return float(value)
        except ValueError:
            return value

    def _read_file(self):
        """
        Read file into raw HTML string, stored under ``self.data['raw']``.

        Errors raised by ``open`` are not handled here.
        """
        logger.debug("reading file '%s' to raw HTML string", self.path)
        with open(self.path) as f:
            self.data['raw'] = f.read()

    def _get_table(self):
        """
        Extract metrics table from raw HTML string.

        The first ``<table>`` with ``cellpadding="3"`` is stored under
        ``self.data['table']``.

        :raises IndexError: if the HTML contains no such table.
        """
        raw_html = self.data['raw']
        soup = BeautifulSoup(raw_html, 'html.parser')
        logger.debug("getting metrics table from raw HTML string")
        self.data['table'] = soup.find_all(
            'table', attrs={'cellpadding': '3'}
        )[0]

    def _check_table_format(self):
        """
        Check whether table is long (keys in one column, values in the other)
        or wide (keys in one row, values in the other).

        :returns: ``'long'`` if any cell contains a non-breaking space,
            otherwise ``'wide'``.
        """
        table = self.data['table']
        has_nbsp = any(
            self._NBSP in td.text
            for tr in table.find_all('tr')
            for td in tr.find_all('td')
        )
        if has_nbsp:
            logger.debug("non-breaking space found in table; long format")
            return 'long'
        logger.debug("no non-breaking space found in table; wide format")
        return 'wide'

    def _parse_long(self):
        """
        Parse long-formatted table to dictionary.

        :returns: dict mapping lower-cased metric names to values; numeric
            values are converted to ``float``, and metrics with an empty
            value are omitted.
        """
        table = self.data['table']

        metrics = {}
        for tr in table.find_all('tr'):
            for td in tr.find_all('td'):
                if not self._KEY_RE.search(td.text):
                    continue
                # ``$`` also matches before a trailing newline, so strip it.
                td_key = td.text.lower().replace('\n', '')
                logger.debug("found long metrics field '%s'", td_key)

                # The value lives in the node right after the key cell; a key
                # cell with nothing usable after it carries no value.
                td_val = getattr(td.next_sibling, 'string', None)
                if td_val is None:
                    logger.debug("no value cell found for long metrics "
                                 "field '%s'; skipping", td_key)
                    continue
                td_val = td_val.replace(self._NBSP, '').replace('\n', '')
                logger.debug("with corresponding long value '%s'", td_val)

                # Wide tables don't have values for some keys at the end of
                # the row (LIBRARY, GROUP, etc.), so metrics with empty
                # values are not written for long tables either; the goal is
                # for wide and long tables to yield matching keys. (Long
                # picard-rnaseq tables are an aberration seen when the
                # library is of very poor quality.)
                if td_val != '':
                    metrics[td_key] = self._to_number(td_val)
        logger.debug("parsed long metrics table: %s", metrics)
        return metrics

    def _parse_wide(self):
        """
        Parse wide-formatted table to dictionary.

        :returns: dict mapping lower-cased metric names to values; numeric
            values are converted to ``float`` and empty values are omitted.
            An empty dict is returned as soon as a row containing a
            non-breaking space is met.
        """
        table = self.data['table']
        metrics = {}
        for tr in table.find_all('tr'):
            if self._NBSP in tr.text:
                return {}
            for td in tr.find_all('td'):
                if not self._WIDE_HEADER_RE.search(td.text):
                    continue
                td_keys = td.text.lower().split('\t')
                logger.debug("found wide metrics fields: %s", td_keys)

                # Values are read from the node two siblings after the header
                # row; skip the header if that node is missing.
                values_row = getattr(tr.next_sibling, 'next_sibling', None)
                values_text = getattr(values_row, 'text', None)
                if values_text is None:
                    logger.debug("no values row found for wide metrics "
                                 "fields; skipping")
                    continue
                td_vals = values_text.split('\t')
                logger.debug("found corresponding wide values: %s", td_vals)

                for key, val in zip(td_keys, td_vals):
                    # Empty values are dropped, as in long tables, so both
                    # formats expose the same set of keys.
                    if val == '':
                        continue
                    metrics[key] = self._to_number(val)
        logger.debug("parsed wide metrics table: %s", metrics)
        return metrics

    def parse(self):
        """
        Parse metrics table and return dictionary.

        :returns: dict of parsed metrics, or an empty dict if an
            ``IndexError`` occurs while parsing (for instance when no
            metrics table is found in the file).
        """
        try:
            self._read_file()
            self._get_table()
            table_format = self._check_table_format()
            if table_format == 'long':
                return self._parse_long()
            return self._parse_wide()
        except IndexError:
            logger.warning("WARNING! Parsing file %s failed.", self.path)
            return {}