Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

- [#896](https://github.com/nf-core/mag/pull/896) - Remove obsolete execution command from README (by @dialvarezs)
- [#907](https://github.com/nf-core/mag/pull/907) - Include refined bins from all binners in the `DASTool/bins` output folder (by @AlexHoratio)
- [#911](https://github.com/nf-core/mag/pull/911) - Ensure column order is consistent when generating depth summaries to prevent swapped results in the merged depth summary (by @dialvarezs)

### `Dependencies`

Expand Down
107 changes: 47 additions & 60 deletions bin/get_mag_depths.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
#!/usr/bin/env python

## Originally written by Sabrina Krakau and released under the MIT license.
## Originally written by Sabrina Krakau and updated by Diego Alvarez. Released under the MIT license.
## See git repository (https://github.com/nf-core/mag) for full license text.

import sys
import argparse
import os.path
import pandas as pd
import csv
import gzip
import os.path
import statistics
import sys

from Bio import SeqIO

Expand All @@ -29,17 +28,14 @@ def parse_args(args=None):
"--depths",
required=True,
metavar="FILE",
help="(Compressed) TSV file containing contig depths for each sample: contigName, contigLen, totalAvgDepth, sample1_avgDepth, sample1_var [, sample2_avgDepth, sample2_var, ...].",
)
parser.add_argument(
"-a", "--assembler", required=True, type=str, help="Assembler name."
)
parser.add_argument(
"-i", "--id", required=True, type=str, help="Sample or group id."
)
parser.add_argument(
"-m", "--binner", required=True, type=str, help="Binning method."
help=(
"(Compressed) TSV file containing contig depths for each sample: contigName, contigLen, "
+ "totalAvgDepth, sample1_avgDepth, sample1_var [, sample2_avgDepth, sample2_var, ...]."
),
)
parser.add_argument("-a", "--assembler", required=True, type=str, help="Assembler name.")
parser.add_argument("-i", "--id", required=True, type=str, help="Sample or group id.")
parser.add_argument("-m", "--binner", required=True, type=str, help="Binning method.")
return parser.parse_args(args)


Expand All @@ -53,57 +49,48 @@ def main(args=None):
sample_names = []
dict_contig_depths = {}
with gzip.open(args.depths, "rt") as infile:
reader = csv.reader(infile, delimiter="\t")
# process header
header = next(reader)
for sample in range(int((len(header) - 3) / 2)):
col_name = header[3 + 2 * sample]
reader = csv.DictReader(infile, delimiter="\t")
# process header to extract sample names from column names
depth_columns = []
for col_name in reader.fieldnames[3::2]: # Every 2nd column starting from index 3
# retrieve sample name: "<assembler>-<id>-<other sample_name>.bam"
sample_name = col_name[len(args.assembler) + 1 + len(args.id) + 1 : -4]
sample_names.append(sample_name)
depth_columns.append(col_name)
# process contig depths
for row in reader:
contig_depths = []
for sample in range(int((len(row) - 3) / 2)):
contig_depths.append(float(row[3 + 2 * sample]))
dict_contig_depths[str(row[0])] = contig_depths

# Initialize output files
n_samples = len(sample_names)
with open(
args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "w"
) as outfile:
print("bin", "\t".join(sample_names), sep="\t", file=outfile)

# for each bin, access contig depths and compute mean bin depth (for all samples)
for file in args.bins:
all_depths = [[] for i in range(n_samples)]

if file.endswith(".gz"):
with gzip.open(file, "rt") as infile:
for rec in SeqIO.parse(infile, "fasta"):
contig_depths = dict_contig_depths[rec.id]
for sample in range(n_samples):
all_depths[sample].append(contig_depths[sample])
else:
with open(file, "rt") as infile:
for rec in SeqIO.parse(infile, "fasta"):
contig_depths = dict_contig_depths[rec.id]
for sample in range(n_samples):
all_depths[sample].append(contig_depths[sample])

binname = os.path.basename(file)
with open(
args.assembler + "-" + args.binner + "-" + args.id + "-binDepths.tsv", "a"
) as outfile:
print(
binname,
"\t".join(
str(statistics.median(sample_depths))
for sample_depths in all_depths
),
sep="\t",
file=outfile,
contig_depths = {}
for sample_name, col_name in zip(sample_names, depth_columns):
contig_depths[sample_name] = float(row[col_name])
dict_contig_depths[row[reader.fieldnames[0]]] = contig_depths

sample_names = sorted(sample_names)

with open(f"{args.assembler}-{args.binner}-{args.id}-binDepths.tsv", "w") as outfile:
writer = csv.writer(outfile, delimiter="\t")
writer.writerow(["bin", *sample_names])

# for each bin, access contig depths and compute mean bin depth (for all samples)
for file in args.bins:
all_depths = {sample: [] for sample in sample_names}

if file.endswith(".gz"):
with gzip.open(file, "rt") as infile:
for rec in SeqIO.parse(infile, "fasta"):
contig_depths = dict_contig_depths[rec.id]
for sample in sample_names:
all_depths[sample].append(contig_depths[sample])
else:
with open(file, "rt") as infile:
for rec in SeqIO.parse(infile, "fasta"):
contig_depths = dict_contig_depths[rec.id]
for sample in sample_names:
all_depths[sample].append(contig_depths[sample])

binname = os.path.basename(file)

writer.writerow(
[binname, *[statistics.median(all_depths[sample]) for sample in sample_names]],
)


Expand Down