用上一篇生成的merged_gene_counts.txt的基因raw counts文件进行差异分析,python环境,利用DESeq2工具。
- 安装必要的库
pip install rpy2 pandas numpy matplotlib seaborn adjustText
- 读取数据,构建实验分组信息
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from rpy2.robjects import pandas2ri
import rpy2.robjects as robjects
# 启用 pandas 和 R 之间的数据转换
pandas2ri.activate()
# 读取基因表达计数文件
df_gene = pd.read_csv("merged_gene_counts.txt", sep="\t", index_col=0)
# 查看数据格式
print(df_gene.head())
print(df_gene.shape) # 基因数 x 样本数
# 构建实验设计信息(metadata)
metadata = pd.DataFrame({
"sample": ["BC07", "BC08", "BC07_2", "BC08_2"], # 样本名称
"condition": ["input", "IP", "input", "IP"] # input vs IP
})
metadata.set_index("sample", inplace=True)
# 保存 metadata
metadata.to_csv("metadata.txt", sep="\t")
print(metadata)
返回:
BC07 BC08 BC07_2 BC08_2
Gene_id_trimmed
Gm37329 0 1 0 0
Gm38148 0 0 0 2
Gm19938 0 0 0 1
ENSMUST00000371377 0 0 1 0
ENSMUST00000371375 0 0 1 0
(26662, 4) # 4个样品,26662个基因
condition
sample
BC07 input
BC08 IP
BC07_2 input
BC08_2 IP
- 运行 R 代码进行 DESeq2 差异分析
# 运行 R 代码
robjects.r('''
library(DESeq2)
# 读取基因表达数据
counts <- read.csv("merged_gene_counts.txt", sep="\t", row.names=1, header=TRUE)
metadata <- read.csv("metadata.txt", sep="\t", row.names=1, header=TRUE)
# 确保样本名称匹配
rownames(metadata) <- colnames(counts)
# 转换 condition 为因子,并设置 input 作为参考水平
metadata$condition <- factor(metadata$condition)
metadata$condition <- relevel(metadata$condition, ref="input")
# 创建 DESeq2 数据集
dds <- DESeqDataSetFromMatrix(countData=counts,
colData=metadata,
design=~condition)
# 运行 DESeq2 进行差异表达分析
dds <- DESeq(dds)
# 获取分析结果
res <- results(dds)
# 过滤显著差异基因 (p值 < 0.05 & |log2FoldChange| > 1)
res_sig <- subset(res, padj < 0.05 & abs(log2FoldChange) > 1)
# 保存分析结果
write.csv(as.data.frame(res), file="DESeq2_results.csv")
write.csv(as.data.frame(res_sig), file="DESeq2_significant_results.csv")
''')
print("✅ 差异表达分析完成!结果已保存:'DESeq2_results.csv' 和 'DESeq2_significant_results.csv'")
- 确认分组信息,看上调下调的方向
import pandas as pd
# 读取 metadata
metadata = pd.read_csv("metadata.txt", sep="\t", index_col=0)
# 查看 unique condition levels
print(metadata["condition"].unique())
返回:
['input' 'IP']
#说明 DESeq2 计算的是 IP / input(即 logFC > 1 代表 IP 组上调,logFC < -1 代表 input 组上调)
- 读取 DESeq2 结果
# 读取分析结果
df_de = pd.read_csv("DESeq2_results.csv", index_col=0)
df_de_sig = pd.read_csv("DESeq2_significant_results.csv", index_col=0)
# 查看结果
print(df_de_sig.head())
print(f"显著差异基因数: {df_de_sig.shape[0]}")
print(df_de[df_de["padj"] == 0])
返回:
baseMean log2FoldChange lfcSE stat pvalue \
Rps2 476.579868 -1.627666 0.243576 -6.682381 2.350913e-11
Snhg6 42.222599 -2.000645 0.666035 -3.003815 2.666170e-03
Prex2 125.954966 -1.737477 0.377972 -4.596847 4.289333e-06
Ncoa2 18.505325 6.107599 1.668580 3.660358 2.518630e-04
Rpl7 134.726136 -1.760752 0.369421 -4.766253 1.876838e-06
padj
Rps2 1.485625e-09
Snhg6 2.097601e-02
Prex2 8.939153e-05
Ncoa2 3.048780e-03
Rpl7 4.280184e-05
显著差异基因数: 626
Empty DataFrame
Columns: [baseMean, log2FoldChange, lfcSE, stat, pvalue, padj]
Index: []
- 绘制火山图
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from adjustText import adjust_text # 自动调整标签位置
plt.figure(figsize=(8, 6))
# 过滤掉 padj 为 0 或 NaN 的值,避免 -log10(0) 变成 inf
df_de = df_de.dropna(subset=["log2FoldChange", "padj"]) # 去掉 NaN
df_de = df_de[df_de["padj"] > 0] # 避免 log10(0) 导致无穷大
# 定义显著基因的布尔掩码
df_de["significant"] = (df_de["padj"] < 0.05) & (abs(df_de["log2FoldChange"]) > 1)
# 额外筛选高置信的基因(更严格)
df_de["highlight"] = (df_de["padj"] < 0.01) & (abs(df_de["log2FoldChange"]) > 2)
# 绘制散点图
sns.scatterplot(
data=df_de, x="log2FoldChange", y=-np.log10(df_de["padj"]),
hue=df_de["significant"], # 直接使用布尔列作为 hue
palette={True: "red", False: "gray"},
alpha=0.6
)
# 添加阈值线
plt.axvline(x=-1, linestyle="--", color="black", alpha=0.7)
plt.axvline(x=1, linestyle="--", color="black", alpha=0.7)
plt.axhline(y=-np.log10(0.05), linestyle="--", color="black", alpha=0.7)
# **修正选择 logFC 绝对值最大 5 个和 padj 最小 10 个**
top_left = df_de[df_de["log2FoldChange"] < 0].nsmallest(10, "padj") # 负向 padj 最小 10 个
top_right = df_de[df_de["log2FoldChange"] > 0].nsmallest(10, "padj") # 正向 padj 最小 10 个
logfc_left = df_de[df_de["log2FoldChange"] < 0].nsmallest(5, "log2FoldChange") # 负向 logFC 绝对值最大 5 个(更负)
logfc_right = df_de[df_de["log2FoldChange"] > 0].nlargest(5, "log2FoldChange") # 正向 logFC 绝对值最大 5 个
# 合并这些基因,并去重
selected_genes = pd.concat([top_left, top_right, logfc_left, logfc_right]).drop_duplicates()
# **过滤 NaN 避免 plt.text() 出错**
selected_genes = selected_genes.dropna(subset=["log2FoldChange", "padj"])
# **添加基因名称标签,避免重叠**
texts = []
for i, row in selected_genes.iterrows():
texts.append(plt.text(row["log2FoldChange"], -np.log10(row["padj"]), row.name,
fontsize=8, ha='right', va='bottom'))
# **使用 adjustText 自动调整文本位置,避免重叠**
adjust_text(texts, arrowprops=dict(arrowstyle="-", color='black', lw=0.5))
# 设置标签和标题
plt.xlabel("log2 Fold Change")
plt.ylabel("-log10(p-adjusted)")
plt.title("Volcano Plot of Differential Expression")
# 自动匹配正确颜色的图例
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles, ["Non-significant", "Significant"], title="Significant")
plt.show()
如何美化火山图,之后再研究吧。