https://docs.google.com/document/d/1q2gciWRhVCAAnlvF2iRLuJ7whrGP6QjpsCMq1yWz7dU
Part 1、LINCS Phase I L1000--GSE92742
1、signature矩阵
# cmapR provides the GCTX readers used below; tidyverse supplies the pipe.
library(cmapR)
library(tidyverse)

# Parse only the first four rows and first four columns of the Level 5
# matrix instead of the whole n473647x12328 file.
gctx_demo <- parse_gctx(
  "GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx",
  rid = seq_len(4),
  cid = seq_len(4)
)
# Inspect the numeric matrix held in the `mat` slot of the parsed object.
gctx_demo@mat
# CPC005_A375_6H:BRD-A85280935-003-01-7:10
# 5720 0.7737690
# 466 -0.8184680
# 6009 0.1895723
# 2309 -0.1460308
# CPC005_A375_6H:BRD-A07824748-001-02-6:10
# 5720 -0.6455861
# 466 -0.8107487
# 6009 0.4590603
# 2309 -0.2246765
# CPC004_A375_6H:BRD-K20482099-001-01-1:10
# 5720 -5.449666
# 466 2.393775
# 6009 1.279790
# 2309 2.167868
# CPC005_A375_6H:BRD-K62929068-001-03-3:10
# 5720 0.1934077
# 466 -0.5822433
# 6009 -0.1789770
# 2309 -1.1820246
2、signature注释信息
- 从GEO下载的顺序与上面gct矩阵的sig顺序不一致,需要调整与gct保持一致,方便后续取子集操作。
如下是已经调整好的:
# Signature annotation table that has already been reordered to follow the
# column order of the GCTX matrix. `data.table = FALSE` makes fread() return
# a plain data.frame (spelled out as FALSE because `F` can be reassigned).
sig_info <- data.table::fread(
  "Fine_phase1_sig_info_473647.csv",
  data.table = FALSE
)
# Column metadata read from the GCTX file itself; used as the reference order.
col_meta <- read_gctx_meta(
  "GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx",
  dim = "col"
)
# Sanity check: the annotation rows must match the matrix columns one-to-one.
identical(col_meta$id, sig_info$sig_id)
# [1] TRUE
dim(sig_info)
# [1] 473647 12
# Transpose the first record so that each field prints on its own line.
t(sig_info[1, ])
# [,1]
# sig_id "CPC005_A375_6H:BRD-A85280935-003-01-7:10"
# pert_id "BRD-A85280935"
# pert_iname "quinpirole"
# pert_type "trt_cp"
# cell_id "A375"
# pert_dose "10.0"
# pert_dose_unit "µM"
# pert_idose "10 µM"
# pert_time "6"
# pert_time_unit "h"
# pert_itime "6 h"
# distil_id "CPC005_A375_6H_X1_B3_DUO52HI53LO:K06|CPC005_A375_6H_X2_B3_DUO52HI53LO:K06|CPC005_A375_6H_X3_B3_DUO52HI53LO:K06"
3、gene注释信息
- 同上也需要调整顺序
# Gene annotation table that has already been reordered to follow the row
# order of the GCTX matrix. `data.table = FALSE` makes fread() return a plain
# data.frame (spelled out as FALSE because `F` can be reassigned).
gene_info <- data.table::fread(
  "Fine_phase1_gene_info_12328.csv",
  data.table = FALSE
)
# Row metadata read from the GCTX file itself; used as the reference order.
row_meta <- read_gctx_meta(
  "GSE92742_Broad_LINCS_Level5_COMPZ.MODZ_n473647x12328.gctx",
  dim = "row"
)
# The GCTX row ids are converted to integer before comparing them with the
# pr_gene_id column of the annotation table.
identical(as.integer(row_meta$id), gene_info$pr_gene_id)
# [1] TRUE
dim(gene_info)
# [1] 12328 5
# Transpose the first record so that each field prints on its own line.
t(gene_info[1, ])
# 1
# pr_gene_id "5720"
# pr_gene_symbol "PSME1"
# pr_gene_title "proteasome activator subunit 1"
# pr_is_lm "1" # landmark 978
# pr_is_bing "1" # landmark + best inferred gene 10174
4、perturbation(化合物)注释信息
- 可结合signature注释信息,做进一步筛选
# Perturbagen annotation table (kept as the data.table returned by fread()).
pert_info <- data.table::fread("phase1_pert_info.csv")
# Transpose the first record so that each field prints on its own line.
t(pert_info[1, ])
# [,1]
# V1 "1"
# pert_id "56582"
# pert_iname "AKT2"
# pert_type "trt_oe"
# is_touchstone "0"
# inchi_key_prefix "-666"
# inchi_key "-666"
# canonical_smiles "-666"
# pubchem_cid "-666"
# Count perturbagens per type, most frequent first. TRUE is spelled out
# because the `T` alias is an ordinary variable that can be reassigned.
table(pert_info$pert_type) %>%
  sort(decreasing = TRUE)
# trt_cp trt_sh trt_sh.cgs trt_sh.css
# 20413 18493 4345 3807
# trt_oe trt_lig trt_oe.mut ctl_vector
# 3492 622 135 61
#...
5、细胞系注释信息
- 可结合signature注释信息,做进一步筛选
# Cell-line annotation table (kept as the data.table returned by fread()).
cell_info <- data.table::fread("phase1_cell_info.csv")
# Transpose the first record so that each field prints on its own line.
t(cell_info[1, ])
# [,1]
# cell_id "A375"
# cell_type "cell line"
# base_cell_id "A375"
# precursor_cell_id "-666"
# modification "-666"
# sample_type "tumor"
# primary_site "skin"
# subtype "malignant melanoma"
# original_growth_pattern "adherent"
# provider_catalog_id "CRL-1619"
# original_source_vendor "ATCC"
# donor_age "54"
# donor_sex "F"
# donor_ethnicity "-666"
# Tabulate how many cell lines fall into each sample type.
table(cell_info[["sample_type"]])
# -666 normal primary tumor
# 1 19 8 70
6、signature干扰效应评价
- Replicate Correlation Coefficient
- signature strength
- Transcriptional Activity Score
# Per-signature quality metrics (kept as the data.table returned by fread()).
# NOTE(review): the file name is spelled "metrcs"; confirm it matches the
# file on disk before renaming anything.
sig_metrics <- data.table::fread("phase1_sig_metrcs.csv")
# Sanity check: the metrics rows must follow the GCTX column order.
identical(sig_metrics$sig_id, col_meta$id)
# [1] TRUE
# Transpose the first record so that each field prints on its own line.
# (The original referenced the undefined object `sig_metrcs` here.)
t(sig_metrics[1, ])
# [,1]
# sig_id "CPC005_A375_6H:BRD-A85280935-003-01-7:10"
# pert_id "BRD-A85280935"
# pert_iname "quinpirole"
# pert_type "trt_cp"
# distil_cc_q75 "0.11" (Replicate Correlation Coefficient)
# distil_ss "2.84895" (signature strength)
# ngenes_modulated_up_lm "18"
# ngenes_modulated_dn_lm "15"
# tas "0.101169" (Transcriptional Activity Score)
# pct_self_rank_q25 "7.6087"
# is_exemplar "0"
# distil_nsample "3"
Part 2、LINCS Phase II L1000--GSE70138
- https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE70138
- 收集、整理数据步骤基本同上,不赘述了。