import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
import os
from matplotlib.gridspec import GridSpec
# -------------------------- 配置参数(可按需修改)--------------------------
input_file = "break.txt" # Input file path
output_dir = "analysis_results" # Output folder (auto-created)
desc_table = f"{output_dir}/descriptive_statistics.csv" # Descriptive stats
diff_table = f"{output_dir}/difference_analysis.csv" # Difference analysis
fig_size = (10, 8) # Single plot size
overview_fig_size = (22, 9) # Overview plot size (wider for facets)
font_size = 10 # Base font size
signif_level = 0.05 # Significance level
treatment_cols = ["3D-GR", "5D-GR", "7D-GR", "3D-SP", "5D-SP", "7D-SP"] # Treatments
cmap_corr = "RdBu_r" # Correlation heatmap colormap
cmap_group = "Set2" # Group color palette
annot_font_size = 8 # 下三角数值字体大小
circle_size_range = (20, 300) # 上三角圆圈大小范围(最小/最大)
# --------------------------------------------------------------------------
# Initialize: Create output folder
os.makedirs(output_dir, exist_ok=True)
# Set Matplotlib backend (save only, no GUI)
plt.switch_backend('agg')
plt.rcParams['font.sans-serif'] = ['Arial', 'DejaVu Sans'] # English font
plt.rcParams['axes.unicode_minus'] = False
# 1. Data loading and preprocessing
def load_data(file_path):
df = pd.read_csv(file_path, sep=r"\s+", header=0, skip_blank_lines=True)
# Check required columns
required_cols = ["Background", "Group"] + treatment_cols
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
raise ValueError(f"Missing required columns: {missing_cols}")
# Convert treatment columns to numeric
for col in treatment_cols:
df[col] = pd.to_numeric(df[col], errors="coerce")
# Remove rows with NaN
df_clean = df.dropna(subset=required_cols)
group_counts = df_clean["Group"].value_counts()
if len(group_counts) != 2:
print(f"Warning: Only {len(group_counts)} Group types detected ({group_counts.index.tolist()}). Difference analysis requires 2 groups, will skip!")
print(f"Data preprocessing completed: Original rows={len(df)}, Valid rows={len(df_clean)}")
print(f"Detected Backgrounds: {df_clean['Background'].unique()}")
print(f"Detected Groups: {df_clean['Group'].unique()}")
return df_clean
# 2. Descriptive statistics (grouped by Background+Group)
def descriptive_statistics(df_clean):
desc_stats = df_clean.groupby(["Background", "Group"])[treatment_cols].agg([
("Sample_Count", "count"),
("Mean", "mean"),
("SD", "std"),
("SE", lambda x: x.sem()) # Standard error
]).round(2)
# Reshape table structure
desc_stats = desc_stats.unstack(level="Group").reset_index()
desc_stats.columns = ["Background"] + [
f"{col}_{stat}_{group}"
for col, stat, group in desc_stats.columns[1:]
]
desc_stats.to_csv(desc_table, index=False, encoding="utf-8-sig")
print(f"\nDescriptive statistics saved to: {desc_table}")
return desc_stats
# 3. Difference analysis (t-test between Groups within each Background)
def difference_analysis(df_clean):
diff_results = []
backgrounds = df_clean["Background"].unique()
groups = df_clean["Group"].unique()
if len(groups) != 2:
return pd.DataFrame()
group1, group2 = groups[0], groups[1]
for bg in backgrounds:
bg_data = df_clean[df_clean["Background"] == bg]
data1 = bg_data[bg_data["Group"] == group1][treatment_cols].dropna()
data2 = bg_data[bg_data["Group"] == group2][treatment_cols].dropna()
for treat in treatment_cols:
val1 = data1[treat].values
val2 = data2[treat].values
if len(val1) < 3 or len(val2) < 3:
print(f"Warning: Insufficient samples for {bg}-{treat} ({group1}={len(val1)}, {group2}={len(val2)}), skip t-test")
continue
# T-test (equal variance assumption)
t_stat, p_val = stats.ttest_ind(val1, val2, equal_var=True)
mean1 = val1.mean()
mean2 = val2.mean()
abs_diff = mean2 - mean1
rel_diff = (abs_diff / mean1 * 100) if mean1 != 0 else np.nan
# Significance marker
if p_val < 0.001:
signif_mark = "***"
elif p_val < 0.01:
signif_mark = "**"
elif p_val < signif_level:
signif_mark = "*"
else:
signif_mark = "ns"
diff_results.append({
"Background": bg,
"Treatment": treat,
f"{group1}_Mean": round(mean1, 2),
f"{group2}_Mean": round(mean2, 2),
"Absolute_Difference_(Group2-Group1)": round(abs_diff, 2),
"Relative_Difference_(%)": round(rel_diff, 2) if not np.isnan(rel_diff) else "NA",
"t_Statistic": round(t_stat, 3),
"p_Value": round(p_val, 4),
"Significance": signif_mark,
"Group1": group1,
"Group2": group2
})
diff_df = pd.DataFrame(diff_results)
diff_df.to_csv(diff_table, index=False, encoding="utf-8-sig")
print(f"\nDifference analysis saved to: {diff_table}")
return diff_df
# 4. Visualization: Individual boxplots (per Background+Group)
def plot_individual_boxplots(df_clean):
bg_group_pairs = df_clean.groupby(["Background", "Group"]).groups.keys()
for bg, group in bg_group_pairs:
plot_data = df_clean[(df_clean["Background"] == bg) & (df_clean["Group"] == group)]
if len(plot_data) < 1:
continue
df_melt = pd.melt(
plot_data,
id_vars=["Background", "Group"],
value_vars=treatment_cols,
var_name="Treatment",
value_name="Value"
)
fig, ax = plt.subplots(figsize=fig_size)
sns.boxplot(
data=df_melt,
x="Treatment",
y="Value",
ax=ax,
palette=cmap_group,
linewidth=1.2,
flierprops={"marker": "o", "markersize": 4}
)
# Annotations
ax.set_title(f"{bg} - {group}", fontsize=font_size + 3, pad=15)
ax.set_xlabel("Treatment", fontsize=font_size + 1)
ax.set_ylabel("Value", fontsize=font_size + 1)
ax.tick_params(axis='x', rotation=45, labelsize=font_size)
ax.tick_params(axis='y', labelsize=font_size)
ax.grid(axis="y", alpha=0.3)
# Save plot (replace special chars in Background name)
safe_bg = bg.replace('/', '_').replace('*', '_').replace(' ', '_')
img_path = f"{output_dir}/{safe_bg}_{group}_Boxplot.pdf"
plt.tight_layout()
plt.savefig(img_path, dpi=300, bbox_inches="tight")
print(f"Individual boxplot saved: {img_path}")
plt.close()
# 5. Visualization: Individual correlation heatmaps (per Background+Group, 内部相关)
def plot_individual_correlation_heatmaps(df_clean):
bg_group_pairs = df_clean.groupby(["Background", "Group"]).groups.keys()
for bg, group in bg_group_pairs:
plot_data = df_clean[(df_clean["Background"] == bg) & (df_clean["Group"] == group)][treatment_cols]
if len(plot_data) < 3:
print(f"Warning: Insufficient samples for {bg}-{group} ({len(plot_data)} rows), skip correlation heatmap")
continue
# Calculate correlation matrix and p-values (内部相关)
corr_matrix = plot_data.corr(method="pearson").round(2)
p_matrix = np.zeros_like(corr_matrix)
for i in range(len(corr_matrix.columns)):
for j in range(len(corr_matrix.columns)):
if i != j:
_, p_matrix[i, j] = stats.pearsonr(plot_data.iloc[:, i], plot_data.iloc[:, j])
# Plot heatmap (内部相关,下三角数值+上三角圆圈+显著性)
fig, ax = plt.subplots(figsize=fig_size)
mask_upper = np.triu(np.ones_like(corr_matrix, dtype=bool)) # 上三角掩码
# 绘制热图(下三角显示数值,上三角隐藏数值)
sns.heatmap(
corr_matrix,
ax=ax,
cmap=cmap_corr,
center=0,
annot=True,
fmt=".2f",
annot_kws={"size": annot_font_size},
square=True,
linewidths=0.8,
mask=mask_upper, # 隐藏上三角数值
cbar_kws={"shrink": 0.9, "label": "Pearson Correlation"}
)
# 上三角:添加圆圈(大小=相关系数绝对值)+ 显著性标记
for i in range(len(corr_matrix.index)):
for j in range(len(corr_matrix.columns)):
if mask_upper[i, j]: # 仅上三角
corr_val = corr_matrix.iloc[i, j]
p_val = p_matrix[i, j]
# 圆圈大小归一化(映射到circle_size_range)
circle_size = np.interp(abs(corr_val), [0, 1], circle_size_range)
# 显著性标记
if p_val < 0.001:
signif = "***"
elif p_val < 0.01:
signif = "**"
elif p_val < signif_level:
signif = "*"
else:
signif = "ns"
# 绘制圆圈(颜色与热图一致)
color = plt.cm.RdBu_r((corr_val + 1) / 2) # 归一化到0-1
circle = plt.Circle((j + 0.5, i + 0.5), circle_size/1000, color=color, alpha=0.7)
ax.add_patch(circle)
# 叠加显著性标记(白色字体,加粗)
ax.text(
j + 0.5, i + 0.5, signif,
ha="center", va="center",
fontsize=annot_font_size - 1, fontweight="bold", color="white"
)
# Annotations
ax.set_title(f"{bg} - {group}\nIntra-Group Correlation Matrix", fontsize=font_size + 3, pad=15)
ax.tick_params(axis='x', rotation=45, labelsize=font_size)
ax.tick_params(axis='y', rotation=0, labelsize=font_size)
# Save plot
safe_bg = bg.replace('/', '_').replace('*', '_').replace(' ', '_')
img_path = f"{output_dir}/{safe_bg}_{group}_Intra_Group_Correlation_Heatmap.pdf"
plt.tight_layout()
plt.savefig(img_path, dpi=300, bbox_inches="tight")
print(f"Individual intra-group correlation heatmap saved: {img_path}")
plt.close()
# 6. Visualization: Overview boxplot (all Backgrounds+Groups in one figure)
def plot_overview_boxplot(df_clean):
# Reshape data
df_melt = pd.melt(
df_clean,
id_vars=["Background", "Group"],
value_vars=treatment_cols,
var_name="Treatment",
value_name="Value"
)
# Create facets by Background
backgrounds = df_clean["Background"].unique()
n_cols = len(backgrounds)
fig, axes = plt.subplots(1, n_cols, figsize=overview_fig_size, sharey=False)
axes = [axes] if n_cols == 1 else axes
for idx, bg in enumerate(backgrounds):
bg_data = df_melt[df_melt["Background"] == bg]
# Plot boxplot (Group as color)
sns.boxplot(
data=bg_data,
x="Treatment",
y="Value",
hue="Group",
ax=axes[idx],
palette=cmap_group,
linewidth=1.2,
flierprops={"marker": "o", "markersize": 3}
)
# Annotations
axes[idx].set_title(f"{bg}", fontsize=font_size + 2)
axes[idx].set_xlabel("Treatment", fontsize=font_size + 1)
axes[idx].set_ylabel("Value", fontsize=font_size + 1)
axes[idx].tick_params(axis='x', rotation=45, labelsize=font_size)
axes[idx].tick_params(axis='y', labelsize=font_size)
axes[idx].legend(title="Group", loc="upper right", fontsize=font_size - 1)
axes[idx].grid(axis="y", alpha=0.3)
# Save overview plot
img_path = f"{output_dir}/Overview_Boxplot.pdf"
plt.tight_layout()
plt.savefig(img_path, dpi=300, bbox_inches="tight")
print(f"Overview boxplot saved: {img_path}")
plt.close()
# 7. Visualization: Overview difference barplot (all Backgrounds in one figure)
def plot_overview_diff_barplot(diff_df):
if diff_df.empty:
print("Skip overview difference barplot: No valid difference data")
return
# Plot bars (Background as color, Treatment as x-axis)
fig, ax = plt.subplots(figsize=overview_fig_size)
sns.barplot(
data=diff_df,
x="Treatment",
y="Absolute_Difference_(Group2-Group1)",
hue="Background",
ax=ax,
palette="tab10",
alpha=0.8,
edgecolor="black",
linewidth=1
)
# Add significance markers
for idx, row in diff_df.iterrows():
x_pos = treatment_cols.index(row["Treatment"])
y_pos = row["Absolute_Difference_(Group2-Group1)"]
y_offset = max(diff_df["Absolute_Difference_(Group2-Group1)"]) * 0.02 if y_pos > 0 else min(diff_df["Absolute_Difference_(Group2-Group1)"]) * 0.02
ax.text(
x_pos,
y_pos + y_offset if y_pos > 0 else y_pos - y_offset,
row["Significance"],
ha="center",
va="bottom" if y_pos > 0 else "top",
fontsize=font_size,
fontweight="bold"
)
# Annotations
group1 = diff_df["Group1"].iloc[0]
group2 = diff_df["Group2"].iloc[0]
ax.set_title(f"Absolute Difference Between {group2} and {group1} (All Backgrounds)", fontsize=font_size + 3, pad=15)
ax.set_xlabel("Treatment", fontsize=font_size + 1)
ax.set_ylabel(f"Absolute Difference ({group2} - {group1})", fontsize=font_size + 1)
ax.tick_params(axis='x', rotation=45, labelsize=font_size)
ax.tick_params(axis='y', labelsize=font_size)
ax.axhline(y=0, color="black", linestyle="-", linewidth=1)
ax.grid(axis="y", alpha=0.3)
ax.legend(title="Background", bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=font_size)
# Save overview plot
img_path = f"{output_dir}/Overview_Difference_Barplot.pdf"
plt.tight_layout()
plt.savefig(img_path, dpi=300, bbox_inches="tight")
print(f"Overview difference barplot saved: {img_path}")
plt.close()
# 8. Visualization: Overview inter-group correlation heatmap (下三角数值+上三角圆圈+显著性)
def plot_overview_group_correlation_heatmap(df_clean):
backgrounds = df_clean["Background"].unique()
groups = df_clean["Group"].unique()
if len(groups) != 2:
print("Skip overview group correlation heatmap: Need 2 Group types for inter-group correlation")
return
group1, group2 = groups[0], groups[1]
n_backgrounds = len(backgrounds)
# Create grid: 1 row × n_backgrounds columns + 1 colorbar column
fig = plt.figure(figsize=overview_fig_size)
gs = GridSpec(1, n_backgrounds + 1, figure=fig, width_ratios=[1]*n_backgrounds + [0.1])
# Shared colorbar axis
cbar_ax = fig.add_subplot(gs[:, -1])
# Plot each Background's inter-group correlation heatmap
for bg_idx, bg in enumerate(backgrounds):
# Extract and align data for two Groups
data_g1 = df_clean[(df_clean["Background"] == bg) & (df_clean["Group"] == group1)][treatment_cols].dropna()
data_g2 = df_clean[(df_clean["Background"] == bg) & (df_clean["Group"] == group2)][treatment_cols].dropna()
min_samples = min(len(data_g1), len(data_g2))
if min_samples < 3:
print(f"Warning: Insufficient paired samples for {bg} (min samples={min_samples}), skip in overview group correlation")
continue
data_g1_aligned = data_g1.iloc[:min_samples].reset_index(drop=True)
data_g2_aligned = data_g2.iloc[:min_samples].reset_index(drop=True)
# Calculate inter-group correlation matrix and p-value matrix
corr_matrix = pd.DataFrame(0.0, index=treatment_cols, columns=treatment_cols)
p_matrix = pd.DataFrame(1.0, index=treatment_cols, columns=treatment_cols)
for i, treat1 in enumerate(treatment_cols):
for j, treat2 in enumerate(treatment_cols):
corr_coef, p_val = stats.pearsonr(data_g1_aligned[treat1], data_g2_aligned[treat2])
corr_matrix.loc[treat1, treat2] = round(corr_coef, 2)
p_matrix.loc[treat1, treat2] = p_val
# Create axis for current Background
ax = fig.add_subplot(gs[:, bg_idx])
# 定义上三角掩码(只隐藏上三角的数值,不隐藏颜色)
mask_upper = np.triu(np.ones_like(corr_matrix, dtype=bool))
# 第一步:绘制完整热图(颜色覆盖全矩阵,下三角显示数值,上三角隐藏数值)
sns.heatmap(
corr_matrix,
ax=ax,
cmap=cmap_corr,
center=0,
annot=True,
fmt=".2f",
annot_kws={"size": annot_font_size, "fontweight": "bold"},
square=True,
linewidths=0.8,
mask=mask_upper, # 仅隐藏上三角的数值标注
cbar=bg_idx == 0, # 仅第一个Background创建颜色条
cbar_ax=cbar_ax if bg_idx == 0 else None,
cbar_kws={"label": "Pearson Correlation (Inter-Group)"} if bg_idx == 0 else None
)
# 第二步:上三角添加圆圈(大小=相关系数绝对值)+ 显著性标记
for i in range(len(corr_matrix.index)):
for j in range(len(corr_matrix.columns)):
if mask_upper[i, j]: # 仅上三角区域
corr_val = corr_matrix.iloc[i, j]
p_val = p_matrix.iloc[i, j]
# 圆圈大小归一化(0→最小圈,1→最大圈)
circle_size = np.interp(abs(corr_val), [0, 1], circle_size_range)
# 显著性标记(根据p值)
if p_val < 0.001:
signif_mark = "***"
elif p_val < 0.01:
signif_mark = "**"
elif p_val < signif_level:
signif_mark = "*"
else:
signif_mark = "ns"
# 绘制圆圈(颜色与热图一致,半透明)
color = plt.cm.RdBu_r((corr_val + 1) / 2) # 归一化相关系数到0-1(适配colormap)
circle = plt.Circle(
(j + 0.5, i + 0.5), # 单元格中心坐标
circle_size / 1000, # 圆圈半径(缩放因子1000,适配坐标轴)
color=color,
alpha=0.8,
edgecolor="black",
linewidth=0.5
)
ax.add_patch(circle)
# 叠加显著性标记(白色粗体,居中)
ax.text(
j + 0.5, i + 0.5,
signif_mark,
ha="center", va="center",
fontsize=annot_font_size - 1,
fontweight="bold",
color="white"
)
# 第三步:设置轴标签和标题
ax.set_title(
f"{bg}\nInter-Group Correlation\n({group1} × {group2})",
fontsize=font_size + 1,
pad=12,
fontweight="bold"
)
ax.set_xlabel(f"{group2} Treatments", fontsize=font_size, fontweight="bold")
ax.set_ylabel(f"{group1} Treatments", fontsize=font_size, fontweight="bold")
ax.tick_params(axis='x', rotation=45, labelsize=annot_font_size)
ax.tick_params(axis='y', rotation=0, labelsize=annot_font_size)
# 调整颜色条标签字体
cbar_ax.yaxis.label.set_fontsize(font_size + 1)
cbar_ax.yaxis.label.set_fontweight("bold")
cbar_ax.tick_params(labelsize=font_size)
# 保存总热图
img_path = f"{output_dir}/Overview_Inter_Group_Correlation_Heatmap.pdf"
plt.tight_layout()
plt.subplots_adjust(top=0.85, wspace=0.4) # 预留标题空间
plt.savefig(img_path, dpi=300, bbox_inches="tight")
print(f"Overview inter-group correlation heatmap saved: {img_path}")
plt.close()
# 9. Main function
if __name__ == "__main__":
try:
# Step 1: Load data
df_clean = load_data(input_file)
# Step 2: Descriptive statistics
desc_stats = descriptive_statistics(df_clean)
# Step 3: Difference analysis
diff_df = difference_analysis(df_clean)
# Step 4: Individual plots (boxplots + intra-group correlation heatmaps)
plot_individual_boxplots(df_clean)
plot_individual_correlation_heatmaps(df_clean)
# Step 5: Overview plots
plot_overview_boxplot(df_clean)
if not diff_df.empty:
plot_overview_diff_barplot(diff_df)
plot_overview_group_correlation_heatmap(df_clean) # 核心:Group间总热图(下三角数值+上三角圆圈+显著性)
print(f"\n=== All analysis completed! Results saved to: {output_dir} ===")
print("Generated files:")
print("1. Statistical Tables: descriptive_statistics.csv, difference_analysis.csv")
print("2. Individual Plots:")
print(" - Boxplots (per Background+Group)")
print(" - Intra-Group Correlation Heatmaps (per Background+Group, 下三角数值+上三角圆圈+显著性)")
print("3. Overview Plots:")
print(" - Overview_Boxplot.pdf (Global boxplot comparison)")
print(" - Overview_Difference_Barplot.pdf (Global difference comparison)")
print(" - Overview_Inter_Group_Correlation_Heatmap.pdf (Global inter-group correlation, 下三角数值+上三角圆圈+显著性)")
except Exception as e:
print(f"Error occurred: {str(e)}")