# 转置数据框并将行名变为列名,列名变为行名
# Read the workbook, using its first column as row names. Column headers are
# kept exactly as written (check.names = FALSE) and text stays character.
df <- read.csv(
  file = "工作簿1.csv",
  row.names = 1,
  stringsAsFactors = FALSE,
  check.names = FALSE
)

# t() swaps rows and columns, so the former row names become column names and
# the former column names become row names. It returns a matrix, hence the
# conversion back to a data frame.
df_t <- as.data.frame(t(df))

# Row names are written out because, after transposing, they carry the
# original column names.
write.csv(df_t, file = "转置后的工作簿2.csv", row.names = TRUE)
# dplyr包提取数据的tips
library(dplyr)
# Load the two tables that the snippets below compare.
aaa <- read.csv(file = "file1.csv")
hhh <- read.csv(file = "file2.csv")

# Keep the rows of `hhh` whose `fff` value also occurs in column `bbb` of
# `aaa`.
result1 <- filter(hhh, fff %in% aaa$bbb)
# Keep the rows of `hhh` whose `fff` value matches a `bbb` value taken from the
# rows of `aaa` where `ccc` is greater than 1, then sort them by `NES` in
# descending order.
#
# which() is used so that rows of `aaa` with a missing `ccc` are skipped.
# Plain logical indexing would turn each of them into an NA lookup value, and
# `%in%` would then also keep every row of `hhh` whose `fff` is NA.
sorted_rows <- hhh %>%
  filter(fff %in% aaa$bbb[which(aaa$ccc > 1)]) %>%
  arrange(desc(NES))

# From the sorted rows, keep those where `pct.1` exceeds `pct.2` by more than
# 0.5.
selected_rows <- sorted_rows %>%
  filter(pct.1 - pct.2 > 0.5)
# Keep the rows of `hhh` whose `fff` text contains both "fatty" and "inna",
# or contains "cell" (which also covers rows containing all three).
# Matching ignores case.
#
# The pattern has three alternatives:
#   fatty.*inna  "fatty", then any characters (possibly none), then "inna"
#   inna.*fatty  "inna", then any characters (possibly none), then "fatty"
#   cell         "cell" anywhere in the string
#
# NOTE: this assigns to `selected_rows`, replacing whatever an earlier snippet
# stored under that name.
selected_rows <- filter(
  hhh,
  grepl(pattern = "fatty.*inna|inna.*fatty|cell", x = fff, ignore.case = TRUE)
)
# 提取几个文件中都富集到的通路信息
# Rows of each table whose `Description` also appears in the other table.
# Each set gets a `from` label recording which table the rows came from.
matched_in_hhh <- mutate(
  filter(hhh, Description %in% aaa$Description),
  from = "iii"
)
matched_in_aaa <- mutate(
  filter(aaa, Description %in% hhh$Description),
  from = "ppp"
)

# Stack both sets of rows and move `from` to the first column.
combined2 <- select(
  bind_rows(matched_in_hhh, matched_in_aaa),
  from, everything()
)

write.csv(combined2, file = "路径/到/新文件名.csv", row.names = FALSE)
# 提取所有通路富集到的基因名称,并按照通路名为列名,内容为基因名,生成新文件
library(dplyr)
library(tidyr)
library(readr)
# Load the GSEA result table. The code below uses its `Description` and
# `core_enrichment` columns.
data <- read_csv('DEG_of_Plasma B cells_ko_vs_wt_GSEA_GOBP.csv')

# Condensed version: the whole reshaping as a single pipeline.
final_data <- data$core_enrichment %>%
  # One character vector of gene names per input row ("/"-separated).
  strsplit("/") %>%
  # Pad every vector with NA up to the longest one so they can be row-bound.
  # The target length is an extra argument to lapply(), so it is computed
  # once instead of once per element.
  lapply(`length<-`, max(lengths(.))) %>%
  do.call(rbind, .) %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  # Attach the pathway name of each row and move it to the front ...
  mutate(pathwaynames = data$Description) %>%
  select(pathwaynames, everything()) %>%
  # ... then transpose so that each pathway becomes one column.
  t() %>%
  # Keep text as character on every R version (the default was TRUE before
  # R 4.0, which would produce factors here).
  as.data.frame(stringsAsFactors = FALSE)

# Replace NA padding with "" (empty string).
cleaned_data <- replace(final_data, is.na(final_data), "")

# NOTE(review): the transposed frame appears to carry default column names
# (V1, V2, ...), with the pathway names in its first row, so the CSV gets a
# V1, V2, ... header and the pathway names in the first data row. Confirm
# that this is the intended layout.
write.csv(cleaned_data, 'nnn1.csv', row.names = FALSE)
##### The same pipeline, written out one step at a time
# Step 1: split each `core_enrichment` string on "/" into a vector of gene
# names (one list element per input row).
split_data <- strsplit(data$core_enrichment, "/")
# Step 2: pad every vector with NA up to the length of the longest one.
# The target length is computed once up front rather than inside the loop;
# the leading 0L keeps max() well-defined when there are no rows.
max_len <- max(0L, lengths(split_data))
adjusted_data <- lapply(split_data, function(x) {
  length(x) <- max_len
  x
})
# Step 3: bind the equal-length vectors into a matrix (one row per pathway).
matrix_data <- do.call(rbind, adjusted_data)
# Step 4: convert the matrix to a data frame.
df_data <- as.data.frame(matrix_data, stringsAsFactors = FALSE)
# Step 5: add the pathway name of each row.
df_data$pathwaynames <- data$Description
# Step 6: move the pathway name column to the front.
df_data <- df_data %>% select(pathwaynames, everything())
# Step 7: transpose, so that each pathway becomes one column.
transposed_df <- t(df_data)
# Step 8: convert the transposed matrix back to a data frame.
final_data <- as.data.frame(transposed_df, stringsAsFactors = FALSE)
# Replace NA padding with "" (empty string).
cleaned_data <- replace(final_data, is.na(final_data), "")
write.csv(cleaned_data, 'nnn2.csv', row.names = FALSE)
# 转化大小写
library(dplyr)
library(stringr)
#### Build a single list of upper-case marker names ----
paper <- "ALB,SERPINA1,HNF4A,EPCAM,CD3D,CD3E,NKG7,CD68,CD14,CD163,
CD1C,CLEC4C,KIT, IGHG1,JCHAIN,CD79A,VWF,PECAM1,
FCGR2B,ACTA2,COL1A1,COL1A2"
# Split on commas, strip the spaces and line breaks that the multi-line
# literal leaves around some names, then upper-case every name.
papermarker <- paper %>%
  strsplit(",") %>%
  unlist() %>%
  trimws() %>%
  str_to_upper()
papermarker
# Wrap the gene vector in a one-element list.
features <- list(papermarker)
#### Build a single list of marker names with only the first letter capitalised ----
paper <- "ALB,SERPINA1,HNF4A,EPCAM,CD3D,CD3E,NKG7,CD68,CD14,CD163,
CD1C,CLEC4C,KIT,IGHG1,JCHAIN,CD79A,VWF,PECAM1,FCGR2B,ACTA2,COL1A1,COL1A2"
# Split the string into one name per element, then trim each name.
# Trimming must happen after the split: the literal contains a line break, and
# trimming only the whole string leaves that line break attached to the name
# that follows it, which also stops the first-letter capitalisation below
# from matching.
paper_markers <- trimws(strsplit(paper, ",")[[1]])
# Hold the names in a one-column tibble.
paper_df <- tibble(gene_names = paper_markers)
# Lower-case everything, then upper-case only the first character.
# str_to_title() is not used because it capitalises the first letter of every
# word, which is only right when each gene name counts as a single word.
paper_df <- paper_df %>%
  mutate(
    gene_names = str_to_lower(gene_names),
    gene_names = str_replace(gene_names, "^(.)", toupper)
  )
# Wrap the gene vector in a one-element list.
features3 <- list(paper_df$gene_names)
####数据库基因集提取---------------------------------------------------------------------------------------------------------------------------------------
library(dplyr)
library(stringr)
library(msigdbr)
# Human C2/KEGG gene sets from msigdbr, reduced to the gene set name and the
# gene symbol. (Gene symbols are kept here; an ID column could be selected
# instead.)
homo_KEGG <- msigdbr(
  species = "Homo sapiens",
  category = "C2",
  subcategory = "KEGG"
) %>%
  dplyr::select(gs_name, gene_symbol)
# Gene sets as a named list: one character vector of gene symbols per gene
# set name. split() is called directly because piping the table into
# split(x = ..., f = ...) would also pass the whole table as an extra first
# positional argument.
homo_KEGG_gene <- split(homo_KEGG$gene_symbol, homo_KEGG$gs_name)
# Pick one pathway (amino sugar and nucleotide sugar metabolism) and wrap its
# genes in a one-element list.
features1 <- list(homo_KEGG_gene$KEGG_AMINO_SUGAR_AND_NUCLEOTIDE_SUGAR_METABOLISM)