相关性绘图统计

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
import os
from matplotlib.gridspec import GridSpec

# -------------------------- 配置参数（可按需修改）--------------------------
input_file = "break.txt"  # Input file path
output_dir = "analysis_results"  # Output folder (auto-created)
desc_table = f"{output_dir}/descriptive_statistics.csv"  # Descriptive stats
diff_table = f"{output_dir}/difference_analysis.csv"    # Difference analysis
fig_size = (10, 8)        # Single plot size
overview_fig_size = (22, 9)  # Overview plot size (wider for facets)
font_size = 10            # Base font size
signif_level = 0.05       # Significance level
treatment_cols = ["3D-GR", "5D-GR", "7D-GR", "3D-SP", "5D-SP", "7D-SP"]  # Treatments
cmap_corr = "RdBu_r"      # Correlation heatmap colormap
cmap_group = "Set2"       # Group color palette
annot_font_size = 8       # 下三角数值字体大小
circle_size_range = (20, 300)  # 上三角圆圈大小范围（最小/最大）
# --------------------------------------------------------------------------

# Initialize: Create output folder
os.makedirs(output_dir, exist_ok=True)

# Set Matplotlib backend (save only, no GUI)
plt.switch_backend('agg')
plt.rcParams['font.sans-serif'] = ['Arial', 'DejaVu Sans']  # English font
plt.rcParams['axes.unicode_minus'] = False

# 1. Data loading and preprocessing
def load_data(file_path):
    df = pd.read_csv(file_path, sep=r"\s+", header=0, skip_blank_lines=True)
    
    # Check required columns
    required_cols = ["Background", "Group"] + treatment_cols
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}")
    
    # Convert treatment columns to numeric
    for col in treatment_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    
    # Remove rows with NaN
    df_clean = df.dropna(subset=required_cols)
    group_counts = df_clean["Group"].value_counts()
    if len(group_counts) != 2:
        print(f"Warning: Only {len(group_counts)} Group types detected ({group_counts.index.tolist()}). Difference analysis requires 2 groups, will skip!")
    
    print(f"Data preprocessing completed: Original rows={len(df)}, Valid rows={len(df_clean)}")
    print(f"Detected Backgrounds: {df_clean['Background'].unique()}")
    print(f"Detected Groups: {df_clean['Group'].unique()}")
    return df_clean

# 2. Descriptive statistics (grouped by Background+Group)
def descriptive_statistics(df_clean):
    desc_stats = df_clean.groupby(["Background", "Group"])[treatment_cols].agg([
        ("Sample_Count", "count"),
        ("Mean", "mean"),
        ("SD", "std"),
        ("SE", lambda x: x.sem())  # Standard error
    ]).round(2)
    
    # Reshape table structure
    desc_stats = desc_stats.unstack(level="Group").reset_index()
    desc_stats.columns = ["Background"] + [
        f"{col}_{stat}_{group}" 
        for col, stat, group in desc_stats.columns[1:]
    ]
    
    desc_stats.to_csv(desc_table, index=False, encoding="utf-8-sig")
    print(f"\nDescriptive statistics saved to: {desc_table}")
    return desc_stats

# 3. Difference analysis (t-test between Groups within each Background)
def difference_analysis(df_clean):
    diff_results = []
    backgrounds = df_clean["Background"].unique()
    groups = df_clean["Group"].unique()
    if len(groups) != 2:
        return pd.DataFrame()
    
    group1, group2 = groups[0], groups[1]
    
    for bg in backgrounds:
        bg_data = df_clean[df_clean["Background"] == bg]
        data1 = bg_data[bg_data["Group"] == group1][treatment_cols].dropna()
        data2 = bg_data[bg_data["Group"] == group2][treatment_cols].dropna()
        
        for treat in treatment_cols:
            val1 = data1[treat].values
            val2 = data2[treat].values
            if len(val1) < 3 or len(val2) < 3:
                print(f"Warning: Insufficient samples for {bg}-{treat} ({group1}={len(val1)}, {group2}={len(val2)}), skip t-test")
                continue
            
            # T-test (equal variance assumption)
            t_stat, p_val = stats.ttest_ind(val1, val2, equal_var=True)
            mean1 = val1.mean()
            mean2 = val2.mean()
            abs_diff = mean2 - mean1
            rel_diff = (abs_diff / mean1 * 100) if mean1 != 0 else np.nan
            
            # Significance marker
            if p_val < 0.001:
                signif_mark = "***"
            elif p_val < 0.01:
                signif_mark = "**"
            elif p_val < signif_level:
                signif_mark = "*"
            else:
                signif_mark = "ns"
            
            diff_results.append({
                "Background": bg,
                "Treatment": treat,
                f"{group1}_Mean": round(mean1, 2),
                f"{group2}_Mean": round(mean2, 2),
                "Absolute_Difference_(Group2-Group1)": round(abs_diff, 2),
                "Relative_Difference_(%)": round(rel_diff, 2) if not np.isnan(rel_diff) else "NA",
                "t_Statistic": round(t_stat, 3),
                "p_Value": round(p_val, 4),
                "Significance": signif_mark,
                "Group1": group1,
                "Group2": group2
            })
    
    diff_df = pd.DataFrame(diff_results)
    diff_df.to_csv(diff_table, index=False, encoding="utf-8-sig")
    print(f"\nDifference analysis saved to: {diff_table}")
    return diff_df

# 4. Visualization: Individual boxplots (per Background+Group)
def plot_individual_boxplots(df_clean):
    bg_group_pairs = df_clean.groupby(["Background", "Group"]).groups.keys()
    for bg, group in bg_group_pairs:
        plot_data = df_clean[(df_clean["Background"] == bg) & (df_clean["Group"] == group)]
        if len(plot_data) < 1:
            continue
        
        df_melt = pd.melt(
            plot_data,
            id_vars=["Background", "Group"],
            value_vars=treatment_cols,
            var_name="Treatment",
            value_name="Value"
        )
        
        fig, ax = plt.subplots(figsize=fig_size)
        sns.boxplot(
            data=df_melt,
            x="Treatment",
            y="Value",
            ax=ax,
            palette=cmap_group,
            linewidth=1.2,
            flierprops={"marker": "o", "markersize": 4}
        )
        
        # Annotations
        ax.set_title(f"{bg} - {group}", fontsize=font_size + 3, pad=15)
        ax.set_xlabel("Treatment", fontsize=font_size + 1)
        ax.set_ylabel("Value", fontsize=font_size + 1)
        ax.tick_params(axis='x', rotation=45, labelsize=font_size)
        ax.tick_params(axis='y', labelsize=font_size)
        ax.grid(axis="y", alpha=0.3)
        
        # Save plot (replace special chars in Background name)
        safe_bg = bg.replace('/', '_').replace('*', '_').replace(' ', '_')
        img_path = f"{output_dir}/{safe_bg}_{group}_Boxplot.pdf"
        plt.tight_layout()
        plt.savefig(img_path, dpi=300, bbox_inches="tight")
        print(f"Individual boxplot saved: {img_path}")
        plt.close()

# 5. Visualization: Individual correlation heatmaps (per Background+Group, 内部相关)
def plot_individual_correlation_heatmaps(df_clean):
    bg_group_pairs = df_clean.groupby(["Background", "Group"]).groups.keys()
    for bg, group in bg_group_pairs:
        plot_data = df_clean[(df_clean["Background"] == bg) & (df_clean["Group"] == group)][treatment_cols]
        if len(plot_data) < 3:
            print(f"Warning: Insufficient samples for {bg}-{group} ({len(plot_data)} rows), skip correlation heatmap")
            continue
        
        # Calculate correlation matrix and p-values (内部相关)
        corr_matrix = plot_data.corr(method="pearson").round(2)
        p_matrix = np.zeros_like(corr_matrix)
        for i in range(len(corr_matrix.columns)):
            for j in range(len(corr_matrix.columns)):
                if i != j:
                    _, p_matrix[i, j] = stats.pearsonr(plot_data.iloc[:, i], plot_data.iloc[:, j])
        
        # Plot heatmap (内部相关，下三角数值+上三角圆圈+显著性)
        fig, ax = plt.subplots(figsize=fig_size)
        mask_upper = np.triu(np.ones_like(corr_matrix, dtype=bool))  # 上三角掩码
        
        # 绘制热图（下三角显示数值，上三角隐藏数值）
        sns.heatmap(
            corr_matrix,
            ax=ax,
            cmap=cmap_corr,
            center=0,
            annot=True,
            fmt=".2f",
            annot_kws={"size": annot_font_size},
            square=True,
            linewidths=0.8,
            mask=mask_upper,  # 隐藏上三角数值
            cbar_kws={"shrink": 0.9, "label": "Pearson Correlation"}
        )
        
        # 上三角：添加圆圈（大小=相关系数绝对值）+ 显著性标记
        for i in range(len(corr_matrix.index)):
            for j in range(len(corr_matrix.columns)):
                if mask_upper[i, j]:  # 仅上三角
                    corr_val = corr_matrix.iloc[i, j]
                    p_val = p_matrix[i, j]
                    
                    # 圆圈大小归一化（映射到circle_size_range）
                    circle_size = np.interp(abs(corr_val), [0, 1], circle_size_range)
                    
                    # 显著性标记
                    if p_val < 0.001:
                        signif = "***"
                    elif p_val < 0.01:
                        signif = "**"
                    elif p_val < signif_level:
                        signif = "*"
                    else:
                        signif = "ns"
                    
                    # 绘制圆圈（颜色与热图一致）
                    color = plt.cm.RdBu_r((corr_val + 1) / 2)  # 归一化到0-1
                    circle = plt.Circle((j + 0.5, i + 0.5), circle_size/1000, color=color, alpha=0.7)
                    ax.add_patch(circle)
                    
                    # 叠加显著性标记（白色字体，加粗）
                    ax.text(
                        j + 0.5, i + 0.5, signif,
                        ha="center", va="center",
                        fontsize=annot_font_size - 1, fontweight="bold", color="white"
                    )
        
        # Annotations
        ax.set_title(f"{bg} - {group}\nIntra-Group Correlation Matrix", fontsize=font_size + 3, pad=15)
        ax.tick_params(axis='x', rotation=45, labelsize=font_size)
        ax.tick_params(axis='y', rotation=0, labelsize=font_size)
        
        # Save plot
        safe_bg = bg.replace('/', '_').replace('*', '_').replace(' ', '_')
        img_path = f"{output_dir}/{safe_bg}_{group}_Intra_Group_Correlation_Heatmap.pdf"
        plt.tight_layout()
        plt.savefig(img_path, dpi=300, bbox_inches="tight")
        print(f"Individual intra-group correlation heatmap saved: {img_path}")
        plt.close()

# 6. Visualization: Overview boxplot (all Backgrounds+Groups in one figure)
def plot_overview_boxplot(df_clean):
    # Reshape data
    df_melt = pd.melt(
        df_clean,
        id_vars=["Background", "Group"],
        value_vars=treatment_cols,
        var_name="Treatment",
        value_name="Value"
    )
    
    # Create facets by Background
    backgrounds = df_clean["Background"].unique()
    n_cols = len(backgrounds)
    fig, axes = plt.subplots(1, n_cols, figsize=overview_fig_size, sharey=False)
    axes = [axes] if n_cols == 1 else axes
    
    for idx, bg in enumerate(backgrounds):
        bg_data = df_melt[df_melt["Background"] == bg]
        # Plot boxplot (Group as color)
        sns.boxplot(
            data=bg_data,
            x="Treatment",
            y="Value",
            hue="Group",
            ax=axes[idx],
            palette=cmap_group,
            linewidth=1.2,
            flierprops={"marker": "o", "markersize": 3}
        )
        
        # Annotations
        axes[idx].set_title(f"{bg}", fontsize=font_size + 2)
        axes[idx].set_xlabel("Treatment", fontsize=font_size + 1)
        axes[idx].set_ylabel("Value", fontsize=font_size + 1)
        axes[idx].tick_params(axis='x', rotation=45, labelsize=font_size)
        axes[idx].tick_params(axis='y', labelsize=font_size)
        axes[idx].legend(title="Group", loc="upper right", fontsize=font_size - 1)
        axes[idx].grid(axis="y", alpha=0.3)
    
    # Save overview plot
    img_path = f"{output_dir}/Overview_Boxplot.pdf"
    plt.tight_layout()
    plt.savefig(img_path, dpi=300, bbox_inches="tight")
    print(f"Overview boxplot saved: {img_path}")
    plt.close()

# 7. Visualization: Overview difference barplot (all Backgrounds in one figure)
def plot_overview_diff_barplot(diff_df):
    if diff_df.empty:
        print("Skip overview difference barplot: No valid difference data")
        return
    
    # Plot bars (Background as color, Treatment as x-axis)
    fig, ax = plt.subplots(figsize=overview_fig_size)
    sns.barplot(
        data=diff_df,
        x="Treatment",
        y="Absolute_Difference_(Group2-Group1)",
        hue="Background",
        ax=ax,
        palette="tab10",
        alpha=0.8,
        edgecolor="black",
        linewidth=1
    )
    
    # Add significance markers
    for idx, row in diff_df.iterrows():
        x_pos = treatment_cols.index(row["Treatment"])
        y_pos = row["Absolute_Difference_(Group2-Group1)"]
        y_offset = max(diff_df["Absolute_Difference_(Group2-Group1)"]) * 0.02 if y_pos > 0 else min(diff_df["Absolute_Difference_(Group2-Group1)"]) * 0.02
        ax.text(
            x_pos,
            y_pos + y_offset if y_pos > 0 else y_pos - y_offset,
            row["Significance"],
            ha="center",
            va="bottom" if y_pos > 0 else "top",
            fontsize=font_size,
            fontweight="bold"
        )
    
    # Annotations
    group1 = diff_df["Group1"].iloc[0]
    group2 = diff_df["Group2"].iloc[0]
    ax.set_title(f"Absolute Difference Between {group2} and {group1} (All Backgrounds)", fontsize=font_size + 3, pad=15)
    ax.set_xlabel("Treatment", fontsize=font_size + 1)
    ax.set_ylabel(f"Absolute Difference ({group2} - {group1})", fontsize=font_size + 1)
    ax.tick_params(axis='x', rotation=45, labelsize=font_size)
    ax.tick_params(axis='y', labelsize=font_size)
    ax.axhline(y=0, color="black", linestyle="-", linewidth=1)
    ax.grid(axis="y", alpha=0.3)
    ax.legend(title="Background", bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=font_size)
    
    # Save overview plot
    img_path = f"{output_dir}/Overview_Difference_Barplot.pdf"
    plt.tight_layout()
    plt.savefig(img_path, dpi=300, bbox_inches="tight")
    print(f"Overview difference barplot saved: {img_path}")
    plt.close()

# 8. Visualization: Overview inter-group correlation heatmap (下三角数值+上三角圆圈+显著性)
def plot_overview_group_correlation_heatmap(df_clean):
    backgrounds = df_clean["Background"].unique()
    groups = df_clean["Group"].unique()
    if len(groups) != 2:
        print("Skip overview group correlation heatmap: Need 2 Group types for inter-group correlation")
        return
    
    group1, group2 = groups[0], groups[1]
    n_backgrounds = len(backgrounds)
    
    # Create grid: 1 row × n_backgrounds columns + 1 colorbar column
    fig = plt.figure(figsize=overview_fig_size)
    gs = GridSpec(1, n_backgrounds + 1, figure=fig, width_ratios=[1]*n_backgrounds + [0.1])
    
    # Shared colorbar axis
    cbar_ax = fig.add_subplot(gs[:, -1])
    
    # Plot each Background's inter-group correlation heatmap
    for bg_idx, bg in enumerate(backgrounds):
        # Extract and align data for two Groups
        data_g1 = df_clean[(df_clean["Background"] == bg) & (df_clean["Group"] == group1)][treatment_cols].dropna()
        data_g2 = df_clean[(df_clean["Background"] == bg) & (df_clean["Group"] == group2)][treatment_cols].dropna()
        min_samples = min(len(data_g1), len(data_g2))
        
        if min_samples < 3:
            print(f"Warning: Insufficient paired samples for {bg} (min samples={min_samples}), skip in overview group correlation")
            continue
        
        data_g1_aligned = data_g1.iloc[:min_samples].reset_index(drop=True)
        data_g2_aligned = data_g2.iloc[:min_samples].reset_index(drop=True)
        
        # Calculate inter-group correlation matrix and p-value matrix
        corr_matrix = pd.DataFrame(0.0, index=treatment_cols, columns=treatment_cols)
        p_matrix = pd.DataFrame(1.0, index=treatment_cols, columns=treatment_cols)
        
        for i, treat1 in enumerate(treatment_cols):
            for j, treat2 in enumerate(treatment_cols):
                corr_coef, p_val = stats.pearsonr(data_g1_aligned[treat1], data_g2_aligned[treat2])
                corr_matrix.loc[treat1, treat2] = round(corr_coef, 2)
                p_matrix.loc[treat1, treat2] = p_val
        
        # Create axis for current Background
        ax = fig.add_subplot(gs[:, bg_idx])
        
        # 定义上三角掩码（只隐藏上三角的数值，不隐藏颜色）
        mask_upper = np.triu(np.ones_like(corr_matrix, dtype=bool))
        
        # 第一步：绘制完整热图（颜色覆盖全矩阵，下三角显示数值，上三角隐藏数值）
        sns.heatmap(
            corr_matrix,
            ax=ax,
            cmap=cmap_corr,
            center=0,
            annot=True,
            fmt=".2f",
            annot_kws={"size": annot_font_size, "fontweight": "bold"},
            square=True,
            linewidths=0.8,
            mask=mask_upper,  # 仅隐藏上三角的数值标注
            cbar=bg_idx == 0,  # 仅第一个Background创建颜色条
            cbar_ax=cbar_ax if bg_idx == 0 else None,
            cbar_kws={"label": "Pearson Correlation (Inter-Group)"} if bg_idx == 0 else None
        )
        
        # 第二步：上三角添加圆圈（大小=相关系数绝对值）+ 显著性标记
        for i in range(len(corr_matrix.index)):
            for j in range(len(corr_matrix.columns)):
                if mask_upper[i, j]:  # 仅上三角区域
                    corr_val = corr_matrix.iloc[i, j]
                    p_val = p_matrix.iloc[i, j]
                    
                    # 圆圈大小归一化（0→最小圈，1→最大圈）
                    circle_size = np.interp(abs(corr_val), [0, 1], circle_size_range)
                    
                    # 显著性标记（根据p值）
                    if p_val < 0.001:
                        signif_mark = "***"
                    elif p_val < 0.01:
                        signif_mark = "**"
                    elif p_val < signif_level:
                        signif_mark = "*"
                    else:
                        signif_mark = "ns"
                    
                    # 绘制圆圈（颜色与热图一致，半透明）
                    color = plt.cm.RdBu_r((corr_val + 1) / 2)  # 归一化相关系数到0-1（适配colormap）
                    circle = plt.Circle(
                        (j + 0.5, i + 0.5),  # 单元格中心坐标
                        circle_size / 1000,   # 圆圈半径（缩放因子1000，适配坐标轴）
                        color=color,
                        alpha=0.8,
                        edgecolor="black",
                        linewidth=0.5
                    )
                    ax.add_patch(circle)
                    
                    # 叠加显著性标记（白色粗体，居中）
                    ax.text(
                        j + 0.5, i + 0.5,
                        signif_mark,
                        ha="center", va="center",
                        fontsize=annot_font_size - 1,
                        fontweight="bold",
                        color="white"
                    )
        
        # 第三步：设置轴标签和标题
        ax.set_title(
            f"{bg}\nInter-Group Correlation\n({group1} × {group2})",
            fontsize=font_size + 1,
            pad=12,
            fontweight="bold"
        )
        ax.set_xlabel(f"{group2} Treatments", fontsize=font_size, fontweight="bold")
        ax.set_ylabel(f"{group1} Treatments", fontsize=font_size, fontweight="bold")
        ax.tick_params(axis='x', rotation=45, labelsize=annot_font_size)
        ax.tick_params(axis='y', rotation=0, labelsize=annot_font_size)
    
    # 调整颜色条标签字体
    cbar_ax.yaxis.label.set_fontsize(font_size + 1)
    cbar_ax.yaxis.label.set_fontweight("bold")
    cbar_ax.tick_params(labelsize=font_size)
    
    # 保存总热图
    img_path = f"{output_dir}/Overview_Inter_Group_Correlation_Heatmap.pdf"
    plt.tight_layout()
    plt.subplots_adjust(top=0.85, wspace=0.4)  # 预留标题空间
    plt.savefig(img_path, dpi=300, bbox_inches="tight")
    print(f"Overview inter-group correlation heatmap saved: {img_path}")
    plt.close()

# 9. Main function
if __name__ == "__main__":
    try:
        # Step 1: Load data
        df_clean = load_data(input_file)
        
        # Step 2: Descriptive statistics
        desc_stats = descriptive_statistics(df_clean)
        
        # Step 3: Difference analysis
        diff_df = difference_analysis(df_clean)
        
        # Step 4: Individual plots (boxplots + intra-group correlation heatmaps)
        plot_individual_boxplots(df_clean)
        plot_individual_correlation_heatmaps(df_clean)
        
        # Step 5: Overview plots
        plot_overview_boxplot(df_clean)
        if not diff_df.empty:
            plot_overview_diff_barplot(diff_df)
        plot_overview_group_correlation_heatmap(df_clean)  # 核心：Group间总热图（下三角数值+上三角圆圈+显著性）
        
        print(f"\n=== All analysis completed! Results saved to: {output_dir} ===")
        print("Generated files:")
        print("1. Statistical Tables: descriptive_statistics.csv, difference_analysis.csv")
        print("2. Individual Plots:")
        print("   - Boxplots (per Background+Group)")
        print("   - Intra-Group Correlation Heatmaps (per Background+Group, 下三角数值+上三角圆圈+显著性)")
        print("3. Overview Plots:")
        print("   - Overview_Boxplot.pdf (Global boxplot comparison)")
        print("   - Overview_Difference_Barplot.pdf (Global difference comparison)")
        print("   - Overview_Inter_Group_Correlation_Heatmap.pdf (Global inter-group correlation, 下三角数值+上三角圆圈+显著性)")
    
    except Exception as e:
        print(f"Error occurred: {str(e)}")
相关性绘图统计

相关性绘图统计

相关阅读更多精彩内容

友情链接更多精彩内容