Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 102 additions & 31 deletions PostProcess/add_risk_score.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,107 @@
#!/usr/bin/env python
"""Adds risk score columns to a report file and computes risk scores for each entry.

This module processes a tab-separated report file, calculates risk score differences,
and writes the results to a new file with updated columns. It supports both primary
and alternative target scenarios based on user input.
"""

from typing import List, Tuple

import sys

# names of the risk score columns appended to every report header
RISKCOLNAMES = ["Highest_CFD_Risk_Score", "Highest_CFD_Absolute_Risk_Score"]


def add_risk_score_columns(header: List[str], alternative: bool) -> List[str]:
    """Append risk score columns and optionally a cluster ID to the header list.

    The header list is extended in place with the predefined risk score
    columns; a CLUSTER_ID column is added as well when the alternative flag
    is set. The same list object is returned for convenience.

    Args:
        header: The list of column names to be updated (modified in place).
        alternative: Whether to add the CLUSTER_ID column.

    Returns:
        List[str]: The updated list of column names including risk score columns.
    """
    header.extend(RISKCOLNAMES)  # extend() takes any iterable; iter() was redundant
    if alternative:
        header.append("CLUSTER_ID")
    return header


def _compute_risk_score(line: str) -> Tuple[float, float]:
"""Calculates the risk score difference and its absolute value from a line of
report data.

This function extracts two score values from the input line, computes their
difference, and returns both the difference and its absolute value.

Args:
line: A tab-separated string containing report data.

Returns:
Tuple[float, float]: The risk score difference and its absolute value.
"""
fields = line.strip().split("\t")
score, score_alt = float(fields[20]), float(fields[21])
score_diff = score - score_alt
return score_diff, abs(score_diff)


def compute_risk_score(report_fname: str, report_outfname: str, alternative: bool):
    """Reads a report file, computes risk scores for each line, and writes the
    results to a new file.

    This function processes the input report, appends risk score columns, and
    outputs the updated data. It handles both primary and alternative target
    scenarios based on the provided flag.

    Args:
        report_fname: Path to the input report file.
        report_outfname: Path to the output report file.
        alternative: Whether to process as alternative targets.

    Returns:
        None

    Raises:
        OSError: If an error occurs while reading or writing files, or while
            parsing score fields.
    """
    try:
        with open(report_fname, mode="r") as fin, open(
            report_outfname, mode="w"
        ) as fout:
            header = fin.readline().strip().split("\t")  # read file header
            header = add_risk_score_columns(
                header, alternative
            )  # append risk score columns
            fout.write("\t".join(header) + "\n")  # write header to output report
            for line in fin:
                score_diff, score_diff_abs = _compute_risk_score(line)
                # BUGFIX: strip the trailing newline before appending the new
                # columns; the previous f"{line}\t..." kept '\n' inside line,
                # splitting every row across two lines in the output
                fout.write(f"{line.rstrip(chr(10))}\t{score_diff}\t{score_diff_abs}\n")
    except (OSError, ValueError, IndexError) as e:
        # narrowed handler: catching bare Exception masked programming errors;
        # these are the failures expected from I/O and malformed score fields
        raise OSError(
            f"An error occurred while computing risk scores for {report_fname}"
        ) from e


def risk_score() -> None:
    """Entry point: parse command-line arguments and add risk scores to a report.

    Expects three positional arguments: the input report path, the output
    report path, and a "True"/"False" flag selecting alternative targets.

    Returns:
        None
    """
    report_fname, report_outfname, alt_flag = sys.argv[1:4]  # read input args
    # the flag arrives as text; only the literal "True" enables alternative mode
    compute_risk_score(report_fname, report_outfname, alt_flag == "True")


# NOTE(review): this flat script appears to be the pre-refactor version of the
# module (shown as deleted in the diff view); it duplicates the behavior of
# risk_score() above — confirm it should not remain in the final file.
file_in = sys.argv[1]
file_out = sys.argv[2]
alt = sys.argv[3]
alt = alt == "True"  # only the literal string "True" enables alternative mode
with open(file_in, "r") as fin:
    with open(file_out, "w") as fout:
        # read the header row and append the risk score column names
        header = fin.readline().strip().split("\t")
        # header.insert(22, 'Highest_CFD_Risk_Score')
        # header.insert(23, 'Highest_CFD_Absolute_Risk_Score')
        # header.append('MMBLG_CFD_Risk_Score')
        # header.append('MMBLG_CFD_Absolute_Risk_Score')
        header.append("Highest_CFD_Risk_Score")
        header.append("Highest_CFD_Absolute_Risk_Score")
        if alt:
            header.append("CLUSTER_ID")
        fout.write("\t".join(header) + "\n")
        for line in fin:
            splitted = line.strip().split("\t")
            # risk score = difference of the scores at 0-based columns 20 and 21
            cfd_diff = float(splitted[20]) - float(splitted[21])
            abs_diff = abs(cfd_diff)
            # mmblg_cfd_diff = float(splitted[42]) - float(splitted[43])
            # mmblg_abs_diff = abs(mmblg_cfd_diff)
            fout.write(
                "\t".join(splitted) + "\t" + str(cfd_diff) + "\t" + str(abs_diff) + "\n"
            )
            # if alt:
            #     fout.write('\t'.join(splitted[:22])+'\t'+"{:.3f}".format(cfd_diff)+'\t'+"{:.3f}".format(abs_diff)+"\t"+"\t".join(
            #         splitted[22:-1])+'\t'+"{:.3f}".format(mmblg_cfd_diff)+'\t'+"{:.3f}".format(mmblg_abs_diff)+"\t"+splitted[-1]+'\n')
            # else:
            #     fout.write('\t'.join(splitted[:22])+'\t'+"{:.3f}".format(cfd_diff)+'\t'+"{:.3f}".format(abs_diff)+"\t"+"\t".join(
            #         splitted[22:])+'\t'+"{:.3f}".format(mmblg_cfd_diff)+'\t'+"{:.3f}".format(mmblg_abs_diff)+'\n')
risk_score()
2 changes: 2 additions & 0 deletions PostProcess/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ def annotate_target(chrom: str, start: int, stop: int, annotation: TabixFile) ->
Returns:
Comma-separated string of annotation features for the target.
"""
if chrom not in annotation.contigs:
return "n" # contig not in input annotation file
target_anns = {
feature.split()[3]
for feature in annotation.fetch(chrom, start, stop)
Expand Down
35 changes: 0 additions & 35 deletions PostProcess/post_analisi_indel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,38 +34,3 @@ sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_na
./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder"
rm "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
rm "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"

# vcf_name=$(basename $3)
# guide_file=$4
# guide_name=$(basename $4)
# mm=$5
# bDNA=$6
# bRNA=$7
# annotation_file=$8
# annotation_name=$(basename $8)
# pam_file=$9
# pam_name=$(basename $9)
# # sampleID=${10}
# dict_folder=${10}

# final_res=${11}
# final_res_alt=${12}

# key=${13}

# echo "Processing INDELs results for $key, starting post-analysis"
# true_chr=$key
# fake_chr="fake$true_chr"

# # create file to prevent void grep failing
# touch "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
# # create file to prevent void grep failing
# touch "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
# awk -v fake_chr="$fake_chr" '$0 ~ fake_chr {print}' "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt" >"$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"
# header=$(head -1 $output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt)
# sed -i 1i"$header" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr"

# ./analisi_indels_NNN.sh "$output_folder/crispritz_targets/${ref_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/crispritz_targets/indels_${ref_name}+${vcf_name}_${pam_name}_${guide_name}_${mm}_${bDNA}_${bRNA}.targets.txt.$true_chr" "$output_folder/${fake_chr}_${pam_name}_${guide_name}_${annotation_name}_${mm}_${bDNA}_${bRNA}" "$annotation_file" "$dict_folder/log_indels_$vcf_name" "$ref_folder/$true_chr.fa" $mm $bDNA $bRNA "$guide_file" "$pam_file" "$output_folder" || {
# echo "CRISPRme ERROR: indels analysis failed (script: ${0} line $((LINENO - 1)))" >&2
# exit 1
# }
91 changes: 71 additions & 20 deletions PostProcess/remove_n_and_dots.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,87 @@
#!/usr/bin/env python
"""
Created on Sun May 30 15:49:39 2021
"""Processes a report file to replace 'n' and '.' values with 'NA'.

@author: franc
This module reads a tab-separated report file in chunks, cleans specific values,
and overwrites the original file with the cleaned data. It is intended for use
in post-processing steps where standardized missing value representation is required.
"""

"""
Script used to replace n and . with NA in bestMerge
"""

import warnings
from pandas.io.parsers import TextFileReader

warnings.simplefilter(action="ignore", category=FutureWarning)
import pandas as pd

import subprocess
import warnings
import sys
import os

in_path = sys.argv[1]
warnings.simplefilter(action="ignore", category=FutureWarning)

# number of rows loaded per DataFrame chunk
CHUNKSIZE = 50000


def read_report_chunks(report_fname: str) -> TextFileReader:
    """Lazily load a tab-separated report as an iterator of DataFrame chunks.

    Args:
        report_fname: Path to the tab-separated report file.

    Returns:
        TextFileReader: An iterator yielding DataFrame chunks of CHUNKSIZE rows.
    """
    chunk_iterator = pd.read_csv(report_fname, chunksize=CHUNKSIZE, sep="\t")
    return chunk_iterator


def polish_report(report_chunks: TextFileReader, report_fname: str) -> None:
    """Cleans and writes report data by replacing specific values with 'NA'.

    Each chunk has whole-cell 'n' values replaced with 'NA', and literal '.'
    characters in the rsID column replaced with 'NA'. Cleaned chunks are
    written to '<report_fname>.tmp'.

    Args:
        report_chunks: An iterator over DataFrame chunks from the report.
        report_fname: Path to the original report file (names the temp file).

    Raises:
        ValueError: If a chunk lacks the required 'rsID' column.

    Returns:
        None
    """
    first = True  # only the first chunk writes the header and truncates
    for chunk in report_chunks:
        if "rsID" not in chunk.columns:
            # assert was unsuitable for input validation: stripped under -O
            raise ValueError("Report is missing the required 'rsID' column")
        chunk = chunk.replace("n", "NA")  # replace whole-cell ns with NAs
        # regex=False: '.' must be literal; under pandas <2.0 the default
        # regex=True made '.' match (and replace) every character
        chunk["rsID"] = chunk["rsID"].str.replace(".", "NA", regex=False)
        # first chunk opens with "w" so a stale .tmp from a previous failed
        # run cannot be appended to
        chunk.to_csv(
            f"{report_fname}.tmp",
            header=first,
            mode="w" if first else "a",
            sep="\t",
            index=False,
        )
        first = False  # not required anymore


def remove_n_dots() -> None:
    """Replaces 'n' and '.' values in a report file with 'NA' and overwrites the
    original file.

    Reads the report path from the first command-line argument, cleans it in
    chunks, then atomically swaps the cleaned temp file over the original.

    Raises:
        FileNotFoundError: If the input report is missing/empty or the temp
            file was not produced.
        OSError: If replacing the original file fails.

    Returns:
        None
    """
    report_fname = sys.argv[1]  # read input report filename
    if not os.path.isfile(report_fname) or os.stat(report_fname).st_size <= 0:
        raise FileNotFoundError(f"Report {report_fname} not found or empty")
    # remove ns and dots and replace them with NAs (written to <report>.tmp)
    polish_report(read_report_chunks(report_fname), report_fname)
    if not os.path.isfile(f"{report_fname}.tmp"):
        raise FileNotFoundError(f"Polished report {report_fname}.tmp not created?")
    # BUGFIX: the previous subprocess.call(f"mv {a} {b}") passed one string
    # without shell=True, so POSIX tried to exec a program literally named
    # "mv a b" and always failed; os.replace is the atomic, portable fix
    os.replace(f"{report_fname}.tmp", report_fname)
remove_n_dots()
4 changes: 2 additions & 2 deletions PostProcess/submit_job_automated_new_multiple_vcfs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ email=${17} # email address (website only)
echo -e "MAIL: $email"
echo -e "CPU used: $ncpus"

#used to solve base editor check in resultintegration phase
# used to solve base editor check in result-integration phase
base_check_start=${18}
base_check_end=${19}
base_check_set=${20}
Expand Down Expand Up @@ -717,7 +717,7 @@ if [ -s $logerror ]; then
rm -f $output_folder/*.bestCFD.txt $output_folder/*.bestmmblg.txt $output_folder/*.bestCRISTA.txt $output_folder/*.bestMerge.txt $output_folder/*.altMerge.txt
exit 1
fi
# remove Ns and dots from rsID from primary targets files
# remove Ns and dots from rsID from alternative targets files
./remove_n_and_dots.py $final_res_alt.bestCFD.txt &
./remove_n_and_dots.py $final_res_alt.bestmmblg.txt &
./remove_n_and_dots.py $final_res_alt.bestCRISTA.txt &
Expand Down