Files
Submissions-App/src/submissions/backend/db/models/controls.py
2024-09-12 12:03:02 -05:00

290 lines
12 KiB
Python

"""
All control related models.
"""
from __future__ import annotations
from sqlalchemy import Column, String, TIMESTAMP, JSON, INTEGER, ForeignKey
from sqlalchemy.orm import relationship, Query
import logging, re
from operator import itemgetter
from . import BaseClass
from tools import setup_lookup
from datetime import date, datetime
from typing import List, Literal
from dateutil.parser import parse
from re import Pattern
logger = logging.getLogger(f"submissions.{__name__}")
class ControlType(BaseClass):
"""
Base class of a control archetype.
"""
id = Column(INTEGER, primary_key=True) #: primary key
name = Column(String(255), unique=True) #: controltype name (e.g. MCS)
targets = Column(JSON) #: organisms checked for
instances = relationship("Control", back_populates="controltype") #: control samples created of this type.
def __repr__(self) -> str:
return f"<ControlType({self.name})>"
@classmethod
@setup_lookup
def query(cls,
name: str = None,
limit: int = 0
) -> ControlType | List[ControlType]:
"""
Lookup control archetypes in the database
Args:
name (str, optional): Name of the desired controltype. Defaults to None.
limit (int, optional): Maximum number of results to return (0 = all). Defaults to 0.
Returns:
ControlType | List[ControlType]: Single result if the limit = 1, else a list.
"""
query = cls.__database_session__.query(cls)
match name:
case str():
query = query.filter(cls.name == name)
limit = 1
case _:
pass
return cls.execute_query(query=query, limit=limit)
def get_subtypes(self, mode: Literal['kraken', 'matches', 'contains']) -> List[str]:
"""
Get subtypes associated with this controltype (currently used only for Kraken)
Args:
mode (str): analysis mode sub_type
Returns:
List[str]: list of subtypes available
"""
# NOTE: Get first instance since all should have same subtypes
# NOTE: Get mode of instance
jsoner = getattr(self.instances[0], mode)
# logger.debug(f"JSON out: {jsoner.keys()}")
try:
# NOTE: Pick genera (all should have same subtypes)
genera = list(jsoner.keys())[0]
except IndexError:
return []
# NOTE: remove items that don't have relevant data
subtypes = [item for item in jsoner[genera] if "_hashes" not in item and "_ratio" not in item]
return subtypes
@classmethod
def get_positive_control_types(cls) -> List[ControlType]:
"""
Gets list of Control types if they have targets
Returns:
List[ControlType]: Control types that have targets
"""
return [item for item in cls.query() if item.targets]
@classmethod
def build_positive_regex(cls) -> Pattern:
"""
Creates a re.Pattern that will look for positive control types
Returns:
Pattern: Constructed pattern
"""
strings = list(set([item.name.split("-")[0] for item in cls.get_positive_control_types()]))
return re.compile(rf"(^{'|^'.join(strings)})-.*", flags=re.IGNORECASE)
class Control(BaseClass):
"""
Base class of a control sample.
"""
id = Column(INTEGER, primary_key=True) #: primary key
parent_id = Column(String,
ForeignKey("_controltype.id", name="fk_control_parent_id")) #: primary key of control type
controltype = relationship("ControlType", back_populates="instances",
foreign_keys=[parent_id]) #: reference to parent control type
name = Column(String(255), unique=True) #: Sample ID
submitted_date = Column(TIMESTAMP) #: Date submitted to Robotics
contains = Column(JSON) #: unstructured hashes in contains.tsv for each organism
matches = Column(JSON) #: unstructured hashes in matches.tsv for each organism
kraken = Column(JSON) #: unstructured output from kraken_report
submission_id = Column(INTEGER, ForeignKey("_basicsubmission.id")) #: parent submission id
submission = relationship("BacterialCulture", back_populates="controls",
foreign_keys=[submission_id]) #: parent submission
refseq_version = Column(String(16)) #: version of refseq used in fastq parsing
kraken2_version = Column(String(16)) #: version of kraken2 used in fastq parsing
kraken2_db_version = Column(String(32)) #: folder name of kraken2 db
sample = relationship("BacterialCultureSample", back_populates="control") #: This control's submission sample
sample_id = Column(INTEGER,
ForeignKey("_basicsample.id", ondelete="SET NULL", name="cont_BCS_id")) #: sample id key
def __repr__(self) -> str:
return f"<Control({self.name})>"
def to_sub_dict(self) -> dict:
"""
Converts object into convenient dictionary for use in submission summary
Returns:
dict: output dictionary containing: Name, Type, Targets, Top Kraken results
"""
# logger.debug("loading json string into dict")
try:
kraken = self.kraken
except TypeError:
kraken = {}
# logger.debug("calculating kraken count total to use in percentage")
kraken_cnt_total = sum([kraken[item]['kraken_count'] for item in kraken])
# logger.debug("Creating new kraken.")
new_kraken = [dict(name=item, kraken_count=kraken[item]['kraken_count'],
kraken_percent="{0:.0%}".format(kraken[item]['kraken_count'] / kraken_cnt_total),
target=item in self.controltype.targets)
for item in kraken]
# logger.debug(f"New kraken before sort: {new_kraken}")
new_kraken = sorted(new_kraken, key=itemgetter('kraken_count'), reverse=True)
# logger.debug("setting targets")
if self.controltype.targets:
targets = self.controltype.targets
else:
targets = ["None"]
# logger.debug("constructing output dictionary")
output = {
"name": self.name,
"type": self.controltype.name,
"targets": ", ".join(targets),
"kraken": new_kraken[0:10]
}
return output
def convert_by_mode(self, mode: Literal['kraken', 'matches', 'contains']) -> list[dict]:
"""
split this instance into analysis types for controls graphs
Args:
mode (str): analysis type, 'contains', etc
Returns:
list[dict]: list of records
"""
output = []
# logger.debug("load json string for mode (i.e. contains, matches, kraken2)")
try:
data = self.__getattribute__(mode)
except TypeError:
data = {}
if data is None:
data = {}
# logger.debug(f"Length of data: {len(data)}")
# logger.debug("dict keys are genera of bacteria, e.g. 'Streptococcus'")
for genus in data:
_dict = dict(
name=self.name,
submitted_date=self.submitted_date,
genus=genus,
target='Target' if genus.strip("*") in self.controltype.targets else "Off-target"
)
# logger.debug("get Target or Off-target of genus")
# logger.debug("set 'contains_hashes', etc for genus")
for key in data[genus]:
_dict[key] = data[genus][key]
output.append(_dict)
# logger.debug("Have to triage kraken data to keep program from getting overwhelmed")
if "kraken" in mode:
output = sorted(output, key=lambda d: d[f"{mode}_count"], reverse=True)[:50]
return output
@classmethod
def get_modes(cls) -> List[str]:
"""
Get all control modes from database
Returns:
List[str]: List of control mode names.
"""
try:
# logger.debug("Creating a list of JSON columns in _controls table")
cols = [item.name for item in list(cls.__table__.columns) if isinstance(item.type, JSON)]
except AttributeError as e:
logger.error(f"Failed to get available modes from db: {e}")
cols = []
return cols
@classmethod
@setup_lookup
def query(cls,
control_type: ControlType | str | None = None,
start_date: date | str | int | None = None,
end_date: date | str | int | None = None,
control_name: str | None = None,
limit: int = 0
) -> Control | List[Control]:
"""
Lookup control objects in the database based on a number of parameters.
Args:
control_type (models.ControlType | str | None, optional): Control archetype. Defaults to None.
start_date (date | str | int | None, optional): Beginning date to search by. Defaults to 2023-01-01 if end_date not None.
end_date (date | str | int | None, optional): End date to search by. Defaults to today if start_date not None.
control_name (str | None, optional): Name of control. Defaults to None.
limit (int, optional): Maximum number of results to return (0 = all). Defaults to 0.
Returns:
models.Control|List[models.Control]: Control object of interest.
"""
query: Query = cls.__database_session__.query(cls)
# NOTE: by control type
match control_type:
case ControlType():
# logger.debug(f"Looking up control by control type: {control_type}")
query = query.filter(cls.controltype == control_type)
case str():
# logger.debug(f"Looking up control by control type: {control_type}")
query = query.join(ControlType).filter(ControlType.name == control_type)
case _:
pass
# NOTE: by date range
if start_date is not None and end_date is None:
logger.warning(f"Start date with no end date, using today.")
end_date = date.today()
if end_date is not None and start_date is None:
logger.warning(f"End date with no start date, using Jan 1, 2023")
start_date = date(2023, 1, 1)
if start_date is not None:
match start_date:
case date():
# logger.debug(f"Lookup control by start date({start_date})")
start_date = start_date.strftime("%Y-%m-%d")
case int():
# logger.debug(f"Lookup control by ordinal start date {start_date}")
start_date = datetime.fromordinal(
datetime(1900, 1, 1).toordinal() + start_date - 2).date().strftime("%Y-%m-%d")
case _:
# logger.debug(f"Lookup control with parsed start date {start_date}")
start_date = parse(start_date).strftime("%Y-%m-%d")
match end_date:
case date():
# logger.debug(f"Lookup control by end date({end_date})")
end_date = end_date.strftime("%Y-%m-%d")
case int():
# logger.debug(f"Lookup control by ordinal end date {end_date}")
end_date = datetime.fromordinal(datetime(1900, 1, 1).toordinal() + end_date - 2).date().strftime(
"%Y-%m-%d")
case _:
# logger.debug(f"Lookup control with parsed end date {end_date}")
end_date = parse(end_date).strftime("%Y-%m-%d")
# logger.debug(f"Looking up BasicSubmissions from start date: {start_date} and end date: {end_date}")
query = query.filter(cls.submitted_date.between(start_date, end_date))
match control_name:
case str():
# logger.debug(f"Lookup control by name {control_name}")
query = query.filter(cls.name.startswith(control_name))
limit = 1
case _:
pass
return cls.execute_query(query=query, limit=limit)