数据处理tips

转置数据框并将行名变为列名,列名变为行名

# Read the table, taking row names from its first column and keeping the
# header text as written (no name checking, no factor conversion).
df <- read.csv(
  file = "工作簿1.csv",
  row.names = 1,
  check.names = FALSE,
  stringsAsFactors = FALSE
)

# Transpose so row names become column names and vice versa. t() returns a
# matrix, so convert the result back to a data frame.
df_t <- as.data.frame(t(df))

# Write the row names too: after transposing they hold the old column names.
write.csv(df_t, file = "转置后的工作簿2.csv", row.names = TRUE)

dplyr包提取数据的tips

library(dplyr)
# Load the two input tables.
aaa <- read.csv(file = "file1.csv")
hhh <- read.csv(file = "file2.csv")

# Keep the rows of hhh whose fff value also occurs in column bbb of aaa.
result1 <- filter(hhh, fff %in% aaa$bbb)

# Keep the rows of hhh whose fff value equals a bbb value taken from the aaa
# rows with ccc > 1, then sort them by NES in descending order.
# which() discards positions where ccc is NA. A bare logical subset would put
# NA into the lookup vector, and %in% would then also select hhh rows whose
# fff is NA.
sorted_rows <- hhh %>%
  filter(fff %in% aaa$bbb[which(aaa$ccc > 1)]) %>%
  arrange(desc(NES))
  
# From sorted_rows, keep the rows where pct.1 minus pct.2 is greater than 0.5.
selected_rows <- filter(sorted_rows, pct.1 - pct.2 > 0.5)
  
# From hhh, keep the rows whose fff text contains both "fatty" and "inna"
# (in either order), or contains "cell"; matching ignores case.
# Pattern alternatives:
#   fatty.*inna  "fatty", then any characters (possibly none), then "inna"
#   inna.*fatty  "inna", then any characters (possibly none), then "fatty"
#   cell         "cell" anywhere in the string
# Note: this assigns to selected_rows again, replacing any earlier value.
selected_rows <- filter(
  hhh,
  grepl("fatty.*inna|inna.*fatty|cell", fff, ignore.case = TRUE)
)

提取几个文件中都富集到的通路信息

# Rows of hhh whose Description also appears in aaa, labelled "iii".
matched_in_hhh <- mutate(
  filter(hhh, Description %in% aaa$Description),
  from = "iii"
)
# Rows of aaa whose Description also appears in hhh, labelled "ppp".
matched_in_aaa <- mutate(
  filter(aaa, Description %in% hhh$Description),
  from = "ppp"
)
# Stack both sets of rows and move the from column to the front.
combined2 <- select(
  bind_rows(matched_in_hhh, matched_in_aaa),
  from, everything()
)

write.csv(combined2, file = "路径/到/新文件名.csv", row.names = FALSE)

提取所有通路富集到的基因名称,并按照通路名为列名,内容为基因名,生成新文件

library(dplyr)
library(tidyr)
library(readr)

# Build a table with one column per pathway: the first row holds the pathway
# name (Description) and the rows below hold that pathway's core-enrichment
# genes, padded with "" so that all columns have the same length.
data <- read_csv("DEG_of_Plasma B cells_ko_vs_wt_GSEA_GOBP.csv")

# core_enrichment stores each pathway's genes as one "/"-separated string.
gene_lists <- strsplit(data$core_enrichment, "/")
# Length of the longest gene list, computed once instead of once per pathway.
max_genes <- max(lengths(gene_lists))

final_data <- gene_lists %>%
  # Pad every gene vector with NA up to max_genes so they can be row-bound.
  lapply(`length<-`, max_genes) %>%
  do.call(rbind, .) %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  mutate(pathwaynames = data$Description) %>%
  select(pathwaynames, everything()) %>%
  # After transposing, each column is one pathway.
  t() %>%
  # Keep text columns as character on every R version (factors would make the
  # NA replacement below emit integer codes).
  as.data.frame(stringsAsFactors = FALSE)

# Replace the NA padding with "" (empty string).
cleaned_data <- final_data %>%
  mutate(across(everything(), ~ ifelse(is.na(.x), "", .x)))
write.csv(cleaned_data, "nnn1.csv", row.names = FALSE)

##### The pipeline explained one step at a time
# Step 1: split each "/"-separated gene string into a vector.
split_data <- strsplit(data$core_enrichment, "/")
# Step 2: pad every vector with NA to the length of the longest one.
# The maximum is computed once up front rather than once per element.
max_len <- max(lengths(split_data))
adjusted_data <- lapply(split_data, function(x) {
  length(x) <- max_len
  x
})
# Step 3: bind the padded vectors into a matrix, one row per pathway.
matrix_data <- do.call(rbind, adjusted_data)
# Step 4: convert the matrix to a data frame.
df_data <- as.data.frame(matrix_data, stringsAsFactors = FALSE)
# Step 5: add the pathway name column.
df_data$pathwaynames <- data$Description
# Step 6: move the pathway name column to the front.
df_data <- df_data %>% select(pathwaynames, everything())
# Step 7: transpose, so that each column is one pathway.
transposed_df <- t(df_data)
# Step 8: convert the transposed matrix back to a data frame.
final_data <- as.data.frame(transposed_df, stringsAsFactors = FALSE)
# Replace the NA padding with "" (empty string).
cleaned_data <- final_data %>%
  mutate(across(everything(), ~ ifelse(is.na(.x), "", .x)))
write.csv(cleaned_data, "nnn2.csv", row.names = FALSE)
转化大小写
library(dplyr)
library(stringr)
#### Upper-case every gene symbol and wrap the vector in a single list ----
paper <- "ALB,SERPINA1,HNF4A,EPCAM,CD3D,CD3E,NKG7,CD68,CD14,CD163,
CD1C,CLEC4C,KIT, IGHG1,JCHAIN,CD79A,VWF,PECAM1,
FCGR2B,ACTA2,COL1A1,COL1A2"
# Split on commas, strip the spaces and line breaks left around each symbol,
# then upper-case.
papermarker <- paper %>%
  strsplit(",") %>%
  unlist() %>%
  trimws() %>%
  str_to_upper()
papermarker
# Wrap the gene vector in a list.
features <- list(papermarker)
#### Capitalise each gene symbol (first letter upper-case, rest lower-case)
#### and wrap the vector in a single list ----
paper <- "ALB,SERPINA1,HNF4A,EPCAM,CD3D,CD3E,NKG7,CD68,CD14,CD163,
CD1C,CLEC4C,KIT,IGHG1,JCHAIN,CD79A,VWF,PECAM1,FCGR2B,ACTA2,COL1A1,COL1A2"
# Split on commas first, then trim each symbol. Trimming only the whole string
# before splitting would leave the literal's line break attached to the front
# of "CD1C".
paper_markers <- trimws(strsplit(paper, ",")[[1]])
# Hold the symbols in a one-column tibble.
paper_df <- tibble(gene_names = paper_markers)
# str_to_title() capitalises the first letter of every word, which may not be
# what is wanted inside a gene symbol. Instead, lower-case the whole symbol and
# then upper-case only its first character.
paper_df <- paper_df %>%
  mutate(
    gene_names = str_to_lower(gene_names),
    gene_names = str_replace(gene_names, "^(.)", function(x) toupper(x))
  )
features3 <- list(paper_df$gene_names)

####数据库基因集提取---------------------------------------------------------------------------------------------------------------------------------------
library(dplyr)
library(stringr)
library(msigdbr)
# Human gene sets requested from msigdbr with category "C2" and subcategory
# "KEGG", reduced to the gene-set name and gene symbol columns (an ID column
# could be selected instead of the symbol).
# NOTE(review): msigdbr's accepted argument names/values may differ between
# releases -- confirm against the installed version.
homo_KEGG <- dplyr::select(
  msigdbr(species = "Homo sapiens", category = "C2", subcategory = "KEGG"),
  gs_name, gene_symbol
)
# Named list with one vector of gene symbols per gs_name.
homo_KEGG_gene <- split(x = homo_KEGG$gene_symbol, f = homo_KEGG$gs_name)
# Pick one pathway (amino sugar and nucleotide sugar metabolism) and wrap it
# in a list as well.
features1 <- list(homo_KEGG_gene$KEGG_AMINO_SUGAR_AND_NUCLEOTIDE_SUGAR_METABOLISM)

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容