最近分析了一批单细胞数据,前期的比对定量使用公司自己的配套软件SeekSoulTools分析,用起来类似于cellranger一行命令完成定量并生成报告。通常单细胞的定量软件都会生成一个网页报告,可以方便查看一些QC指标。但是,样本的网页报告都是独立的,多个样本查看起来并不方便。这时候就可以用MultiQC方便地汇集多个样本的QC结果。但是,MultiQC原生并不支持该软件,为了方便重复使用,所以利用MultiQC插件功能实现该软件QC结果的汇总。
解析QC源文件
SeekSoulTools会在结果中生成一个json格式的源文件,里面包含所有的QC结果,汇总结果只需解析该文件即可。
def parse_seeksoultools_log(self, f) -> Tuple[Optional[str], Dict]:
"""Parse the JSON output from seeksoultools and save the summary statistics"""
try:
parsed_json = json.load(f["f"])
except json.JSONDecodeError as e:
log.warning(f"Could not parse seeksoultools JSON: '{f['fn']}': {e}, skipping sample")
return None, {}
if not isinstance(parsed_json, dict):
return None, {}
s_name = None
# Check if we should use filename instead of parsed sample name
should_use_filename = False
if isinstance(config.use_filename_as_sample_name, list):
# Check for module anchor
if self.anchor in config.use_filename_as_sample_name:
should_use_filename = True
elif config.use_filename_as_sample_name is True:
should_use_filename = True
elif getattr(config, "seeksoultools", {}).get("s_name_filenames", False):
# Deprecated option - warn user
log.warning(
"The 'seeksoultools.s_name_filenames' config option is deprecated. Use the global 'use_filename_as_sample_name' option instead."
)
should_use_filename = True
if should_use_filename:
s_name = f["s_name"]
if s_name is None:
log.warning(
f"Falling back to extracting it from the file name: "
f'"{f["fn"]}" -> "{s_name}"'
)
s_names = []
m = f["s_name"].split('_s')
if m:
s_names.append(m[0])
s_name = self.clean_s_name(s_names, f)
else:
s_name = f["s_name"]
self.add_data_source(f, s_name)
return s_name, parsed_json
获取QC信息
def process_parsed_data(self, parsed_json: Dict, s_name: str):
"""Process the JSON extracted from logs"""
self.seeksoultools_data[s_name] = {}
self.seeksoultools_all_data[s_name] = parsed_json
# Parse base info
try:
self.seeksoultools_data[s_name] = {**parsed_json['stat'], **parsed_json['mapping'], **parsed_json['cells']}
except KeyError:
log.debug(f"seeksoultools JSON did not have 'stat | mapping | cells' key: '{s_name}'")
# Remove empty dicts
if len(self.seeksoultools_data[s_name]) == 0:
del self.seeksoultools_data[s_name]
if len(self.seeksoultools_all_data[s_name]) == 0:
del self.seeksoultools_all_data[s_name]
统计表格
def seeksoultools_general_stats_table(self, data_by_sample):
"""Take the parsed stats from the seeksoultools report and add it to the
General Statistics table at the top of the report"""
self.general_stats_addcols(
data_by_sample,
headers={
"total": {"title": "total_reads"},
"valid": {"title": "valid_reads"},
"trimmed": {"title": "trimmed_reads"},
"B_no_correction": {"title": "barcode_no_correction"},
"B_corrected": {"title": "barcode_correction"},
"chemistry": {},
"Reads Mapped to Genome": {"title": "Mapped_to_Genome", "format": "{:,.5f}"},
"Reads Mapped Confidently to Genome": {"title": "Unique_Mapped_to_Genome", "format": "{:,.5f}"},
"Reads Mapped to Intergenic Regions": {"title": "Mapped_to_Intergenic", "format": "{:,.5f}"},
"Reads Mapped to Intronic Regions": {"title": "Mapped_to_Intronic", "format": "{:,.5f}"},
"Reads Mapped to Exonic Regions": {"title": "Mapped_to_Exonic", "format": "{:,.5f}"},
'Fraction Reads in Cells': {"title": "Reads_in_Cells", "format": "{:,.5f}"},
'Estimated Number of Cells': {"title": "Number_of_Cells"},
'Mean Reads per Cell': {"title": "Mean_Reads_per_Cell"},
'Median Genes per Cell': {"title": "Median_Genes_per_Cell"},
'Median UMI Counts per Cell': {"title": "Median_UMI_Counts_per_Cell"},
'Total Genes Detected': {"title": "Total_Genes_Detected"},
'Sequencing Saturation': {"title": "Sequencing_Saturation", "format": "{:,.5f}"}
},
)
绘制比对条形图
def seeksoultools_mapping_plot(self):
"""Function to generate the seeksoultools mapping reads bar plot"""
# Specify the order of the different possible categories
keys = {
"Reads Mapped to Exonic Regions": {"name": "mapped_to_exon"},
"Reads Mapped to Intronic Regions": {"name": "mapped_to_intron"},
"Reads Mapped to Intergenic Regions": {"name": "mapped_to_intergenic"},
}
# Config for the plot
pconfig = {
"id": "seeksoultools_mapping_bargraph",
"title": "Mapping",
"ylab": "Mapping Reads",
}
return bargraph.plot(self.seeksoultools_data, keys, pconfig)
绘制细胞数量条形图
def seeksoultools_cellnum_plot(self):
"""Function to generate the seeksoultools cellnmum bar plot"""
# Specify the order of the different possible categories
keys = {
"Estimated Number of Cells": {"name": "Number_of_Cells"},
}
# Config for the plot
pconfig = {
"id": "seeksoultools_cellnum_bargraph",
"title": "Cell Number",
"ylab": "Cell Number",
}
return bargraph.plot(self.seeksoultools_data, keys, pconfig)
插件主要是由以上几个函数组成,将插件安装好就可以正常的在MultiQC里面使用了:
multiqc -m seeksoultools -n summary report/*_summary.json


