Python 绘制 CRISPR sgRNA Counts均一度 分布曲线图
示例结果
实现步骤:
- 读取 MAGeCK 分析后的 Gene Counts 表格,去表头
- 计算累计密度
- 计算分位数和均一度
- 添加分位数标记线(保持中文SimHei)
- 添加差值标注(保持中文)
- 绘制图形并保存
- 打印结果(10%,90%分位数,差值,均一度等)
全部代码如下:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os,sys
# Make sys.path[0] the working directory so that the relative paths used
# further down (input workbook, saved figures) resolve against it.  When the
# file is run as a script, sys.path[0] is the directory containing the script.
os.chdir(sys.path[0])
def read_and_plot_cumsum(file_path, sheet_name, column_name,
                         output_prefix='Cumulative_Density_Plot', show=True):
    """Plot the cumulative distribution of log2 read counts and report uniformity.

    One column of an Excel sheet is read, missing values are dropped, the
    values are sorted and log2-transformed, and the empirical cumulative
    probability is plotted against them.  The 10th and 90th percentiles of the
    log2 values are marked on the plot, and ``2 ** (q90 - q10)`` is reported
    as the "uniformity" figure.

    Args:
        file_path: Path of the Excel workbook to read.
        sheet_name: Worksheet to read from the workbook.
        column_name: Column holding the read counts.
        output_prefix: File name stem for the saved figure; ``.jpg`` and
            ``.pdf`` are appended.  Defaults to the name used historically.
        show: When true (default), call ``plt.show()`` after saving.

    Returns:
        dict with keys ``q10``, ``q90``, ``diff``, ``uniformity`` (all
        computed on the log2 scale, ``uniformity = 2 ** diff``) and
        ``n_excluded`` (number of non-positive values left out).

    Raises:
        KeyError: If ``column_name`` is not a column of the sheet.
        ValueError: If the column holds no positive value after dropping
            missing entries.
    """
    import warnings  # local import so the block does not depend on file-level changes

    df = pd.read_excel(file_path, sheet_name=sheet_name)
    sorted_counts = np.sort(df[column_name].dropna())

    # log2 is undefined for counts <= 0; left in, they become -inf/nan and
    # poison the percentiles.  Exclude them and say so explicitly.
    positive_counts = sorted_counts[sorted_counts > 0]
    n_excluded = sorted_counts.size - positive_counts.size
    if positive_counts.size == 0:
        raise ValueError(
            f"Column {column_name!r} has no positive values to log2-transform."
        )
    if n_excluded:
        warnings.warn(
            f"{n_excluded} non-positive value(s) in column {column_name!r} "
            "were excluded before the log2 transform.",
            stacklevel=2,
        )

    log2_data = np.log2(positive_counts)

    # Empirical cumulative probability running from 0 to 1.  A single point
    # would make the divisor zero, so it is assigned probability 1 directly.
    n_points = log2_data.size
    if n_points > 1:
        yvals = np.arange(n_points) / float(n_points - 1)
    else:
        yvals = np.ones(1)

    # Percentiles on the log2 scale; 2 ** diff converts the spread back to a
    # fold-change between the 90th and 10th percentile.
    q10 = np.percentile(log2_data, 10)
    q90 = np.percentile(log2_data, 90)
    diff = q90 - q10
    pow_diff = 2 ** diff

    # NOTE: these settings modify matplotlib's global rcParams and therefore
    # persist after this function returns.  'font.family' ends up as Arial;
    # SimHei is only registered in the sans-serif list.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.rc('font', family='Arial', weight='bold', size=14)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(log2_data, yvals)

    # The global font weight is bold; keep spines thin and tick labels,
    # axis labels and title in normal weight.
    ax.spines[:].set_linewidth(1)
    for label in ax.get_xticklabels() + ax.get_yticklabels():
        label.set_fontweight('normal')

    ax.set_xlabel("Log2 Normalized sgRNA Read Counts",
                  fontsize=16, fontname='Arial', weight='normal')
    ax.set_ylabel('Cumulative Probability',
                  fontsize=16, fontname='Arial', weight='normal')
    ax.set_title('Cumulative Density Plot of sgRNA Read Counts',
                 fontsize=18, fontname='Arial', weight='normal')
    ax.grid(True)

    # Percentile marker lines, labelled in the legend.
    ax.axvline(x=q10, color='r', linestyle='--', label=f'10%: {q10:.2f}')
    ax.axvline(x=q90, color='g', linestyle='--', label=f'90%: {q90:.2f}')
    # Single legend call: a second bare legend() would replace this one and
    # drop the font setting.
    ax.legend(prop={'family': 'Arial'})

    # Uniformity annotation, positioned in axes-relative coordinates.
    ax.text(0.05, 0.9,
            f'Uniformity (90%/10%) = {pow_diff:.2f}',
            transform=ax.transAxes,
            fontsize=16, weight='bold', family='Arial')

    fig.savefig(f'{output_prefix}.jpg', dpi=300, bbox_inches='tight')
    fig.savefig(f'{output_prefix}.pdf', bbox_inches='tight')

    print(f"10%分位数(X值): {q10:.2f}")
    print(f"90%分位数(X值): {q90:.2f}")
    print(f"90%-10%差值: {diff:.2f}")
    print(f"2^(差值): {pow_diff:.2f}")
    print(f"均一度(90%/10%): {pow_diff:.2f}")

    if show:
        plt.show()

    return {
        'q10': float(q10),
        'q90': float(q90),
        'diff': float(diff),
        'uniformity': float(pow_diff),
        'n_excluded': int(n_excluded),
    }
# Example usage
if __name__ == "__main__":
    # Replace the workbook path, worksheet name and column name below with
    # the ones that match your own data.
    read_and_plot_cumsum(
        file_path="./Drug-F1R1.count.xlsx",
        sheet_name="Drug-F1R1.count",
        column_name="plasmid",
    )