#!/usr/bin/env python3
"""Streamlit front end for PlottingOnGenome.

The app has two stages tracked in ``st.session_state.stage``:

* stage 0 renders the input form and, on submit, runs the pipeline;
* stage 1 renders tables, CSV downloads and plots for the pipeline result
  stored in ``st.session_state.pipeline``.
"""
import argparse
from pathlib import Path
from tempfile import TemporaryDirectory

import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st

import plotting_on_genome as pog

# st.set_page_config(layout="wide")

# Defaults for the session-state keys used throughout the app. Only keys that
# are missing are set, so values survive Streamlit reruns.
_SESSION_DEFAULTS = {"stage": 0, "insert_types": "both", "workdir": None}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default


class TempDirManager:
    """Context manager yielding a working directory path.

    With ``dirpath=None`` a ``TemporaryDirectory`` is created on entry and
    removed on exit. Otherwise ``dirpath`` is created if needed (including
    missing parents), returned unchanged, and left in place on exit.
    """

    def __init__(self, dirpath=None):
        self.dirpath = dirpath
        # Only populated while a temporary directory is active.
        self.temp_dir = None

    def __enter__(self):
        if self.dirpath is None:
            self.temp_dir = TemporaryDirectory()
            return self.temp_dir.name
        # parents=True so a nested work directory such as "out/run1" works.
        Path(self.dirpath).mkdir(parents=True, exist_ok=True)
        return self.dirpath

    def __exit__(self, exc_type, exc_value, traceback):
        # Clean up only what this manager created; exceptions are not
        # suppressed (implicit ``None`` return).
        if self.temp_dir is not None:
            self.temp_dir.cleanup()
            self.temp_dir = None


@st.cache_data
def run_pipeline(
    seq_fh, genome_fh, email, search_term, retmax, fwd_suf, rev_suf, workdir=None
):
    """Write the uploads to a working directory and build ``pog.InsertsDict``.

    Args:
        seq_fh: Iterable of uploaded sequence files; each must provide
            ``getvalue()`` returning bytes. All are concatenated into
            ``seqs.fasta``.
        genome_fh: Uploaded genome file (provides ``name`` and
            ``getvalue()``), or ``None`` when no genome file is supplied.
        email: Forwarded to ``pog.InsertsDict`` as ``email``.
        search_term: Forwarded to ``pog.InsertsDict`` as ``search_term``.
        retmax: Forwarded to ``pog.InsertsDict`` as ``retmax``.
        fwd_suf: Forwarded to ``pog.InsertsDict`` as ``fwd_suffix``.
        rev_suf: Forwarded to ``pog.InsertsDict`` as ``rev_suffix``.
        workdir: Directory to work in; ``None`` uses a temporary directory
            that is deleted once the pipeline object has been built.

    Returns:
        The constructed ``pog.InsertsDict``.
    """
    with st.spinner("Processing..."), TempDirManager(workdir) as work_dir:
        dirpath = Path(work_dir)

        genome_path = None
        if genome_fh is not None:
            # The upload's file name comes from the browser: keep only its
            # final component so it cannot point outside the work directory.
            genome_path = str(dirpath / Path(genome_fh.name).name)
            with open(genome_path, "wb") as fh:
                fh.write(genome_fh.getvalue())

        seq_path = str(dirpath / "seqs.fasta")
        with open(seq_path, "wb") as fh:
            for seq in seq_fh:
                # Make sure that there is a new line between individual seqs
                fh.write(seq.getvalue())
                fh.write(b"\n")

        res = pog.InsertsDict(
            seq_file=seq_path,
            work_dir=dirpath,
            genome_file=genome_path,
            search_term=search_term,
            email=email,
            retmax=retmax,
            fwd_suffix=fwd_suf,
            rev_suffix=rev_suf,
        )
    return res


def submit(*args):
    """Button callback: run the pipeline with ``args`` and move to stage 1."""
    st.session_state.pipeline = run_pipeline(*args)
    st.session_state.stage = 1


def get_main_inputs():
    """Render the stage-0 input form and its Submit button.

    The ``--workdir`` command-line flag adds a text input that lets the user
    choose the working directory; without it the stored session value is
    passed through unchanged.
    """
    parser = argparse.ArgumentParser("PlottingOnGenome")
    parser.add_argument("--workdir", action="store_true")
    args = parser.parse_args()

    # Radio buttons to switch between text input and file upload
    genome_src = st.radio("Genome:", ("NCBI", "File"))

    # Only the inputs belonging to the selected genome source are rendered;
    # the others stay None.
    genome_fh = None
    search_term = None
    email = None
    retmax = None

    if genome_src == "NCBI":
        search_term = st.text_input("Search term:", key="search_term")
        email = st.text_input("Email:", key="email")
        # NOTE(review): st.text_input returns a string, so retmax reaches the
        # pipeline as text -- confirm pog.InsertsDict accepts that.
        retmax = st.text_input(
            "Retmax (Max. # of records from NCBI)", 200, key="retmax"
        )
    else:
        genome_fh = st.file_uploader(
            "Upload genome:", type=("gbk", "gff"), key="genome"
        )

    seq_fh = st.file_uploader(
        "Sequence file:",
        type=["fasta", "fas", "fna"],
        key="seqs",
        accept_multiple_files=True,
    )

    fwd_suf = st.text_input("Forward suffix:", "_F", key="fwd_suf")
    rev_suf = st.text_input("Reverse suffix:", "_R", key="rev_suf")

    st.session_state.insert_types = st.selectbox(
        "insert types:", ["both", "matched", "unmatched"]
    )

    if args.workdir:
        st.session_state.workdir = st.text_input("workdir", "Output")

    st.button(
        "Submit",
        on_click=submit,
        args=[
            seq_fh,
            genome_fh,
            email,
            search_term,
            retmax,
            fwd_suf,
            rev_suf,
            st.session_state.workdir,
        ],
    )


@st.cache_data
def convert_df(df, **kwargs):
    """Return ``df`` serialised with ``DataFrame.to_csv(**kwargs)`` as UTF-8 bytes."""
    return df.to_csv(**kwargs).encode("utf-8")


def _plot_inserts(inserts_all, params, df_genes, buffer):
    """Render one figure per insert of a user-selected sequence id."""
    st.write("Features to display:")
    ft_checkboxes = {
        "CDS": st.checkbox("CDS", value=True),
        "gene": st.checkbox("gene", value=True),
    }
    feature_types = [name for name, checked in ft_checkboxes.items() if checked]

    # The local name ``seq_id`` is referenced by the ``@seq_id`` query below.
    seq_id = st.selectbox("Select sequence id:", inserts_all.seq_ids)
    inserts = inserts_all.get(seq_id, **params)

    if not len(inserts):
        st.write(f"No inserts found for {seq_id}!")
        return

    st.write(df_genes.query("seq_id == @seq_id"))
    for insert in inserts:
        fig, axs = plt.subplots(2, 1, figsize=(10, 6), height_ratios=[3, 5])
        fig.suptitle(f"Insert {insert.idx}")
        insert.plot(buffer=buffer, axs=axs, feature_types=feature_types)
        st.pyplot(fig)
        # Close this specific figure so figures do not pile up across reruns.
        plt.close(fig)


def _plot_genome(inserts_all, params, df_inserts):
    """Render the genome plot, optionally with inserts and labels."""
    labels = st.toggle("Labels")

    inserts = []
    if st.toggle("Plot inserts", True):
        inserts = inserts_all.get(**params)
        st.write(df_inserts)

    # ``labels`` is a bool: the figure is three times taller with labels on.
    fig, ax = plt.subplots(figsize=(10, 10 * (1 + 2 * labels)))
    inserts_all.plot(inserts, show_labels=labels, ax=ax)
    st.pyplot(fig, use_container_width=True)
    plt.close(fig)


def _plot_insert_dists(inserts_all, params):
    """Render the insert distribution plot of the selected type."""
    plot_type = st.radio(
        "Plot type:", ["histogram", "violinplot+boxplot+stripplot"]
    )
    inserts = inserts_all.get(**params)

    plotters = {
        "histogram": pog.plot_histogram,
        "violinplot+boxplot+stripplot": pog.plot_dists,
    }
    plotter = plotters.get(plot_type)
    if plotter is None:
        raise ValueError("Incorrect plot type")

    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
    plotter(inserts, axs=axs)
    st.pyplot(fig)
    plt.close(fig)


def show_results():
    """Render the stage-1 view: filters, CSV downloads and the chosen plot.

    Reads the pipeline result from ``st.session_state.pipeline`` and the
    insert type selection from ``st.session_state.insert_types``. If no
    pipeline result is available a warning is shown and nothing else is
    rendered.
    """
    inserts_all = st.session_state.get("pipeline")
    if inserts_all is None:
        # Checked up front: everything below dereferences the result.
        st.warning("No results available. Please submit the inputs first.")
        return

    insert_types = st.session_state.insert_types

    filter_threshold = st.slider("Filter threshold:", 0.0, 1.0, 0.7)
    buffer = st.slider(
        "View window size:",
        min_value=0,
        max_value=10000,
        value=4000,
        help="Number of bases either side of the insert",
    )

    params = dict(insert_types=insert_types, filter_threshold=filter_threshold)

    df_inserts = inserts_all.to_dataframe(**params)
    df_genes = inserts_all.genes_to_dataframe(**params, buffer=buffer)

    st.download_button(
        label="Download all inserts as CSV",
        data=df_inserts.pipe(convert_df, index=False),
        file_name="inserts.csv",
        mime="text/csv",
        use_container_width=True,
    )
    st.download_button(
        label="Download all genes as CSV",
        data=df_genes.pipe(convert_df, index=False),
        file_name="genes.csv",
        mime="text/csv",
        use_container_width=True,
    )

    # index=None: nothing is selected (and nothing plotted) until the user
    # picks a plot type.
    option = st.selectbox(
        "plot type:",
        ["plot inserts", "plot genome", "plot insert dists"],
        None,
    )

    if option == "plot inserts":
        _plot_inserts(inserts_all, params, df_genes, buffer)
    elif option == "plot genome":
        _plot_genome(inserts_all, params, df_inserts)
    elif option == "plot insert dists":
        _plot_insert_dists(inserts_all, params)


def main():
    """Render the page header and the view for the current stage."""
    col1, col2 = st.columns((6, 1), gap="large", vertical_alignment="bottom")

    with col1:
        st.title("PlottingOnGenome")

    with col2:
        # The Reset button is only offered on the results view.
        if st.session_state.stage == 1 and st.button("Reset"):
            st.session_state.stage = 0
            st.rerun()

    if st.session_state.stage == 0:
        get_main_inputs()

    if st.session_state.stage >= 1:
        show_results()


if __name__ == "__main__":
    main()