Improvements to submission querying.

This commit is contained in:
lwark
2024-04-26 15:25:24 -05:00
parent b619d751b8
commit 5378c79933
7 changed files with 164 additions and 136 deletions

View File

@@ -1,59 +1,59 @@
'''
"""
All control related models.
'''
"""
from __future__ import annotations
from sqlalchemy import Column, String, TIMESTAMP, JSON, INTEGER, ForeignKey
from sqlalchemy.orm import relationship, Query
import logging, re, sys
import logging, re
from operator import itemgetter
from . import BaseClass
from tools import setup_lookup
from datetime import date, datetime
from typing import List
from dateutil.parser import parse
from re import Pattern
logger = logging.getLogger(f"submissions.{__name__}")
class ControlType(BaseClass):
"""
Base class of a control archetype.
"""
id = Column(INTEGER, primary_key=True) #: primary key
name = Column(String(255), unique=True) #: controltype name (e.g. MCS)
targets = Column(JSON) #: organisms checked for
instances = relationship("Control", back_populates="controltype") #: control samples created of this type.
"""
id = Column(INTEGER, primary_key=True) #: primary key
name = Column(String(255), unique=True) #: controltype name (e.g. MCS)
targets = Column(JSON) #: organisms checked for
instances = relationship("Control", back_populates="controltype") #: control samples created of this type.
def __repr__(self) -> str:
return f"<ControlType({self.name})>"
@classmethod
@setup_lookup
def query(cls,
name:str=None,
limit:int=0
) -> ControlType|List[ControlType]:
def query(cls,
name: str = None,
limit: int = 0
) -> ControlType | List[ControlType]:
"""
Lookup control archetypes in the database
Args:
name (str, optional): Control type name (limits results to 1). Defaults to None.
limit (int, optional): Maximum number of results to return. Defaults to 0.
name (str, optional): Name of the desired controltype. Defaults to None.
limit (int, optional): Maximum number of results to return (0 = all). Defaults to 0.
Returns:
models.ControlType|List[models.ControlType]: ControlType(s) of interest.
"""
ControlType | List[ControlType]: Single result if the limit = 1, else a list.
"""
query = cls.__database_session__.query(cls)
match name:
case str():
query = query.filter(cls.name==name)
query = query.filter(cls.name == name)
limit = 1
case _:
pass
return cls.query_return(query=query, limit=limit)
def get_subtypes(self, mode:str) -> List[str]:
return cls.execute_query(query=query, limit=limit)
def get_subtypes(self, mode: str) -> List[str]:
"""
Get subtypes associated with this controltype
@@ -62,56 +62,68 @@ class ControlType(BaseClass):
Returns:
List[str]: list of subtypes available
"""
"""
# Get first instance since all should have same subtypes
# outs = self.instances[0]
# Get mode of instance
# jsoner = json.loads(getattr(outs, mode))
jsoner = getattr(self.instances[0], mode)
logger.debug(f"JSON out: {jsoner.keys()}")
# logger.debug(f"JSON out: {jsoner.keys()}")
try:
# Pick genera (all should have same subtypes)
genera = list(jsoner.keys())[0]
except IndexError:
return []
# remove items that don't have relevant data
subtypes = [item for item in jsoner[genera] if "_hashes" not in item and "_ratio" not in item]
return subtypes
@classmethod
def get_positive_control_types(cls):
def get_positive_control_types(cls) -> List[ControlType]:
"""
Gets list of Control types if they have targets
Returns:
List[ControlType]: Control types that have targets
"""
return [item for item in cls.query() if item.targets != []]
@classmethod
def build_positive_regex(cls):
def build_positive_regex(cls) -> Pattern:
"""
Creates a re.Pattern that will look for positive control types
Returns:
Pattern: Constructed pattern
"""
strings = list(set([item.name.split("-")[0] for item in cls.get_positive_control_types()]))
return re.compile(rf"(^{'|^'.join(strings)})-.*", flags=re.IGNORECASE)
class Control(BaseClass):
"""
Base class of a control sample.
"""
id = Column(INTEGER, primary_key=True) #: primary key
parent_id = Column(String, ForeignKey("_controltype.id", name="fk_control_parent_id")) #: primary key of control type
controltype = relationship("ControlType", back_populates="instances", foreign_keys=[parent_id]) #: reference to parent control type
name = Column(String(255), unique=True) #: Sample ID
submitted_date = Column(TIMESTAMP) #: Date submitted to Robotics
contains = Column(JSON) #: unstructured hashes in contains.tsv for each organism
matches = Column(JSON) #: unstructured hashes in matches.tsv for each organism
kraken = Column(JSON) #: unstructured output from kraken_report
submission_id = Column(INTEGER, ForeignKey("_basicsubmission.id")) #: parent submission id
submission = relationship("BacterialCulture", back_populates="controls", foreign_keys=[submission_id]) #: parent submission
refseq_version = Column(String(16)) #: version of refseq used in fastq parsing
kraken2_version = Column(String(16)) #: version of kraken2 used in fastq parsing
kraken2_db_version = Column(String(32)) #: folder name of kraken2 db
sample = relationship("BacterialCultureSample", back_populates="control") #: This control's submission sample
sample_id = Column(INTEGER, ForeignKey("_basicsample.id", ondelete="SET NULL", name="cont_BCS_id")) #: sample id key
"""
id = Column(INTEGER, primary_key=True) #: primary key
parent_id = Column(String,
ForeignKey("_controltype.id", name="fk_control_parent_id")) #: primary key of control type
controltype = relationship("ControlType", back_populates="instances",
foreign_keys=[parent_id]) #: reference to parent control type
name = Column(String(255), unique=True) #: Sample ID
submitted_date = Column(TIMESTAMP) #: Date submitted to Robotics
contains = Column(JSON) #: unstructured hashes in contains.tsv for each organism
matches = Column(JSON) #: unstructured hashes in matches.tsv for each organism
kraken = Column(JSON) #: unstructured output from kraken_report
submission_id = Column(INTEGER, ForeignKey("_basicsubmission.id")) #: parent submission id
submission = relationship("BacterialCulture", back_populates="controls",
foreign_keys=[submission_id]) #: parent submission
refseq_version = Column(String(16)) #: version of refseq used in fastq parsing
kraken2_version = Column(String(16)) #: version of kraken2 used in fastq parsing
kraken2_db_version = Column(String(32)) #: folder name of kraken2 db
sample = relationship("BacterialCultureSample", back_populates="control") #: This control's submission sample
sample_id = Column(INTEGER,
ForeignKey("_basicsample.id", ondelete="SET NULL", name="cont_BCS_id")) #: sample id key
def __repr__(self) -> str:
"""
Returns:
str: Representation of self
"""
return f"<Control({self.name})>"
def to_sub_dict(self) -> dict:
@@ -120,7 +132,7 @@ class Control(BaseClass):
Returns:
dict: output dictionary containing: Name, Type, Targets, Top Kraken results
"""
"""
# logger.debug("loading json string into dict")
try:
# kraken = json.loads(self.kraken)
@@ -133,7 +145,8 @@ class Control(BaseClass):
for item in kraken:
# logger.debug("calculating kraken percent (overwrites what's already been scraped)")
kraken_percent = kraken[item]['kraken_count'] / kraken_cnt_total
new_kraken.append({'name': item, 'kraken_count':kraken[item]['kraken_count'], 'kraken_percent':"{0:.0%}".format(kraken_percent)})
new_kraken.append({'name': item, 'kraken_count': kraken[item]['kraken_count'],
'kraken_percent': "{0:.0%}".format(kraken_percent)})
new_kraken = sorted(new_kraken, key=itemgetter('kraken_count'), reverse=True)
# logger.debug("setting targets")
if self.controltype.targets == []:
@@ -142,14 +155,14 @@ class Control(BaseClass):
targets = self.controltype.targets
# logger.debug("constructing output dictionary")
output = {
"name" : self.name,
"type" : self.controltype.name,
"targets" : ", ".join(targets),
"kraken" : new_kraken[0:5]
"name": self.name,
"type": self.controltype.name,
"targets": ", ".join(targets),
"kraken": new_kraken[0:5]
}
return output
def convert_by_mode(self, mode:str) -> list[dict]:
def convert_by_mode(self, mode: str) -> list[dict]:
"""
split this instance into analysis types for controls graphs
@@ -158,7 +171,7 @@ class Control(BaseClass):
Returns:
list[dict]: list of records
"""
"""
output = []
# logger.debug("load json string for mode (i.e. contains, matches, kraken2)")
try:
@@ -191,7 +204,7 @@ class Control(BaseClass):
Returns:
List[str]: List of control mode names.
"""
"""
try:
# logger.debug("Creating a list of JSON columns in _controls table")
cols = [item.name for item in list(cls.__table__.columns) if isinstance(item.type, JSON)]
@@ -202,13 +215,13 @@ class Control(BaseClass):
@classmethod
@setup_lookup
def query(cls,
control_type:ControlType|str|None=None,
start_date:date|str|int|None=None,
end_date:date|str|int|None=None,
control_name:str|None=None,
limit:int=0
) -> Control|List[Control]:
def query(cls,
control_type: ControlType | str | None = None,
start_date: date | str | int | None = None,
end_date: date | str | int | None = None,
control_name: str | None = None,
limit: int = 0
) -> Control | List[Control]:
"""
Lookup control objects in the database based on a number of parameters.
@@ -221,16 +234,16 @@ class Control(BaseClass):
Returns:
models.Control|List[models.Control]: Control object of interest.
"""
"""
query: Query = cls.__database_session__.query(cls)
# by control type
match control_type:
case ControlType():
# logger.debug(f"Looking up control by control type: {control_type}")
query = query.filter(cls.controltype==control_type)
query = query.filter(cls.controltype == control_type)
case str():
# logger.debug(f"Looking up control by control type: {control_type}")
query = query.join(ControlType).filter(ControlType.name==control_type)
query = query.join(ControlType).filter(ControlType.name == control_type)
case _:
pass
# by date range
@@ -247,7 +260,8 @@ class Control(BaseClass):
start_date = start_date.strftime("%Y-%m-%d")
case int():
# logger.debug(f"Lookup control by ordinal start date {start_date}")
start_date = datetime.fromordinal(datetime(1900, 1, 1).toordinal() + start_date - 2).date().strftime("%Y-%m-%d")
start_date = datetime.fromordinal(
datetime(1900, 1, 1).toordinal() + start_date - 2).date().strftime("%Y-%m-%d")
case _:
# logger.debug(f"Lookup control with parsed start date {start_date}")
start_date = parse(start_date).strftime("%Y-%m-%d")
@@ -257,7 +271,8 @@ class Control(BaseClass):
end_date = end_date.strftime("%Y-%m-%d")
case int():
# logger.debug(f"Lookup control by ordinal end date {end_date}")
end_date = datetime.fromordinal(datetime(1900, 1, 1).toordinal() + end_date - 2).date().strftime("%Y-%m-%d")
end_date = datetime.fromordinal(datetime(1900, 1, 1).toordinal() + end_date - 2).date().strftime(
"%Y-%m-%d")
case _:
# logger.debug(f"Lookup control with parsed end date {end_date}")
end_date = parse(end_date).strftime("%Y-%m-%d")
@@ -270,5 +285,4 @@ class Control(BaseClass):
limit = 1
case _:
pass
return cls.query_return(query=query, limit=limit)
return cls.execute_query(query=query, limit=limit)