摸了个鱼之给SeekSoulTools软件写个MultiQC插件

最近分析了一批单细胞数据,前期的比对定量使用公司自己的配套软件SeekSoulTools分析,用起来类似于cellranger一行命令完成定量并生成报告。通常单细胞的定量软件都会生成一个网页报告,可以方便查看一些QC指标。但是,样本的网页报告都是独立的,多个样本查看起来并不方便。这时候就可以用MultiQC方便地汇集多个样本的QC结果。但是,MultiQC原生并不支持该软件,为了方便重复使用,所以利用MultiQC插件功能实现该软件QC结果的汇总。

解析QC源文件

SeekSoulTools会在结果中生成一个json格式的源文件,里面包含所有的QC结果,汇总结果只需解析该文件即可。

def parse_seeksoultools_log(self, f) -> Tuple[Optional[str], Dict]:
    """Parse the JSON output from seeksoultools and save the summary statistics"""
    try:
        parsed_json = json.load(f["f"])
    except json.JSONDecodeError as e:
        log.warning(f"Could not parse seeksoultools JSON: '{f['fn']}': {e}, skipping sample")
        return None, {}
    if not isinstance(parsed_json, dict):
        return None, {}

    s_name = None
    # Check if we should use filename instead of parsed sample name
    should_use_filename = False
    if isinstance(config.use_filename_as_sample_name, list):
        # Check for module anchor
        if self.anchor in config.use_filename_as_sample_name:
            should_use_filename = True
    elif config.use_filename_as_sample_name is True:
        should_use_filename = True
    elif getattr(config, "seeksoultools", {}).get("s_name_filenames", False):
        # Deprecated option - warn user
        log.warning(
            "The 'seeksoultools.s_name_filenames' config option is deprecated. Use the global 'use_filename_as_sample_name' option instead."
        )
        should_use_filename = True

    if should_use_filename:
        s_name = f["s_name"]

    if s_name is None:
        log.warning(
            f"Falling back to extracting it from the file name: "
            f'"{f["fn"]}" -> "{s_name}"'
            )
        s_names = []
        m = f["s_name"].split('_s')
        if m:
            s_names.append(m[0])
            s_name = self.clean_s_name(s_names, f)
        else:
            s_name = f["s_name"]

    self.add_data_source(f, s_name)
    return s_name, parsed_json
获取QC信息
def process_parsed_data(self, parsed_json: Dict, s_name: str):
    """Process the JSON extracted from logs"""

    self.seeksoultools_data[s_name] = {}
    self.seeksoultools_all_data[s_name] = parsed_json

    # Parse base info
    try:
        self.seeksoultools_data[s_name] = {**parsed_json['stat'], **parsed_json['mapping'], **parsed_json['cells']}
    except KeyError:
        log.debug(f"seeksoultools JSON did not have 'stat | mapping | cells' key: '{s_name}'")

    # Remove empty dicts
    if len(self.seeksoultools_data[s_name]) == 0:
        del self.seeksoultools_data[s_name]
    if len(self.seeksoultools_all_data[s_name]) == 0:
        del self.seeksoultools_all_data[s_name]
统计表格
def seeksoultools_general_stats_table(self, data_by_sample):
    """Take the parsed stats from the seeksoultools report and add it to the
    General Statistics table at the top of the report"""

    self.general_stats_addcols(
        data_by_sample,
        headers={
            "total": {"title": "total_reads"},
            "valid": {"title": "valid_reads"},
            "trimmed": {"title": "trimmed_reads"},
            "B_no_correction": {"title": "barcode_no_correction"},
            "B_corrected": {"title": "barcode_correction"},
            "chemistry": {},
            "Reads Mapped to Genome": {"title": "Mapped_to_Genome", "format": "{:,.5f}"},
            "Reads Mapped Confidently to Genome": {"title": "Unique_Mapped_to_Genome", "format": "{:,.5f}"},
            "Reads Mapped to Intergenic Regions": {"title": "Mapped_to_Intergenic", "format": "{:,.5f}"},
            "Reads Mapped to Intronic Regions": {"title": "Mapped_to_Intronic", "format": "{:,.5f}"},
            "Reads Mapped to Exonic Regions": {"title": "Mapped_to_Exonic", "format": "{:,.5f}"},
            'Fraction Reads in Cells': {"title": "Reads_in_Cells", "format": "{:,.5f}"},
            'Estimated Number of Cells': {"title": "Number_of_Cells"},
            'Mean Reads per Cell': {"title": "Mean_Reads_per_Cell"},
            'Median Genes per Cell': {"title": "Median_Genes_per_Cell"},
            'Median UMI Counts per Cell': {"title": "Median_UMI_Counts_per_Cell"},
            'Total Genes Detected': {"title": "Total_Genes_Detected"},
            'Sequencing Saturation': {"title": "Sequencing_Saturation", "format": "{:,.5f}"}
        },
    )
绘制比对条形图
def seeksoultools_mapping_plot(self):
    """Function to generate the seeksoultools mapping reads bar plot"""
    # Specify the order of the different possible categories
    keys = {
        "Reads Mapped to Exonic Regions": {"name": "mapped_to_exon"},
        "Reads Mapped to Intronic Regions": {"name": "mapped_to_intron"},
        "Reads Mapped to Intergenic Regions": {"name": "mapped_to_intergenic"},
    }

    # Config for the plot
    pconfig = {
        "id": "seeksoultools_mapping_bargraph",
        "title": "Mapping",
        "ylab": "Mapping Reads",

    }
    return bargraph.plot(self.seeksoultools_data, keys, pconfig)
绘制细胞数量条形图
def seeksoultools_cellnum_plot(self):
    """Function to generate the seeksoultools cellnmum bar plot"""
    # Specify the order of the different possible categories
    keys = {
        "Estimated Number of Cells": {"name": "Number_of_Cells"},
    }

    # Config for the plot
    pconfig = {
        "id": "seeksoultools_cellnum_bargraph",
        "title": "Cell Number",
        "ylab": "Cell Number",

    }
    return bargraph.plot(self.seeksoultools_data, keys, pconfig)

插件主要是由以上几个函数组成,将插件安装好就可以正常的在MultiQC里面使用了:

multiqc -m seeksoultools -n summary report/*_summary.json
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

友情链接更多精彩内容