基本运算
> x = 3 ** 2
> x
[1] 9
>
> x = 3 ^ 2
> x
[1] 9
>
> x = sqrt(64)
> x
[1] 8
>
> x = -7
> y = abs(x)
> y
[1] 7
>
对数 指数
> x = exp(1) # exp(x)计算自然常数e的x次方
> x
[1] 2.718282
>
> x = exp(3) # e的3次方
> x
[1] 20.08554
>
> x = exp(0.5) # e的0.5次方
> x
[1] 1.648721
>
>
> # exp()和log()互为反函数
> # log()以e为底的对数
> # 一般基底的对数,语法是log(x,m),其中m是底。如果底是10,也可这么写log10()
> x = log(2) # 计算以e为底的对数
> x
[1] 0.6931472
>
> x = log(2, 10) # 计算以10为底的对数
> x
[1] 0.30103
>
> x = log10(2) # 计算以10为底的对数
> x
[1] 0.30103
>
> x = 1.2 * 10^4
> x
[1] 12000
>
> x = 1.2e4
> x
[1] 12000
>
近似数
> round(98.345, digits = 2)
[1] 98.34
> round(98.345, 2)
[1] 98.34
>
> signif(1234567.896543, digits = 7) #第二个位置是有效数字的个数
[1] 1234568
> signif(1234567.896543, digits = 8)
[1] 1234568
> signif(1234567.896543, digits = 6)
[1] 1234570
> signif(1234567.896543, digits = 5)
[1] 1234600
> signif(1234567.896543, digits = 1)
[1] 1e+06
>
> # 近似函数floor(), ceiling(), trunc()可直接取整数:floor()向下取整,ceiling()向上取整,trunc()直接截去小数部分
>
设置重复
> # rep(x, times=重复次数,each=每次每个元素的重复次数,length.out=向量长度)
>
简单统计
> # sum(), max(), min(), mean()
>
> # prod() 计算所有元素的积
>
> # cumsum()计算所有元素的累计和
>
> # cumprod()计算所有元素的累计积
>
> # cummax()可返回各元素从向量起点到该元素位置间所有元素的最大值
>
> # cummin()可返回各元素从向量起点到该元素位置间所有元素的最小值
>
> # diff()返回相邻元素之差,即后一元素减去前一元素(x[i+1] - x[i])
>
> # sort(x, decreasing = FALSE)排序
>
> # rank()返回每个元素在从小到大排序中的名次(排在第几位)
>
> # rev()将向量对象颠倒排列
>
> # length()计算向量对象长度,即向量对象元素个数
>
> # sd()标准差
>
> # var()样本变异数
>
查询确认/转换数据类型
> # is.integer()
> # is.numeric()
> # is.double()
> # is.character()
> # is.matrix()
> # is.array()
>
> # as.character()
> # as.numeric()
>
>
> # str()探索对象结构,了解数据类型/长度/内容
> # class()对向量对象而言,可使用它了解对象元素的数据类型
>
逻辑运算
> # x & y 如果x和y均为T,则传回T
> # x | y 如果x或y为T,则传回T
> # !x 传回非x
> # xor (x, y) 相当于XOR运算,如果x和y不同,传回T
which函数
> # which()所使用的参数是一个比较表达式,可以返回符合条件的索引值
> x <- c(6, 5, 3, 4, 5)
> which( x > 4 )
[1] 1 2 5
>
> # which.max():给出最大值的第一个索引值。注意,一个向量中,最大值可能出现多次。
> # which.min():给出最小值的第一个索引值。
NA的去除
> # 关于NA。可先用is.na()判断向量中是否含有NA,然后用!is.na()即可删除NA
> x <- c(9, 1, NA, 8, 6)
> x[x>5 & !is.na(x)]
[1] 9 8 6
> x
[1] 9 1 NA 8 6
any函数
> # any(),给予比较条件,只要参数向量对象有1个元素是T,则返回T
>
> # 向量对象元素的命名obj <- c (name1 = data1, name2 = data2, ……)
names等函数
> # names()可查询向量对象元素名称,也可更改向量对象元素名称。如想删除向量对象的元素的名称,将其设为NULL即可。
>
> # matrix(data, nrow=?, ncol=?, byrow=logical, dimnames=NULL)
> x <- matrix(3:11, nrow = 3, byrow=T, dimnames=list(rownames,colnames))
Error in matrix(3:11, nrow = 3, byrow = T, dimnames = list(rownames, colnames)) :
length of 'dimnames' [1] not equal to array extent
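> # 上面报错的原因:调用matrix()时rownames和colnames尚未定义为向量(此时它们指向R的同名内置函数),应先定义这两个向量再调用matrix();由于matrix()没有执行成功,下面的x仍是之前的向量
> colnames <- c("col1", "col2", "col3")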
> rownames <- c("row1","row2","row3")
> x
[1] 9 1 NA 8 6
矩阵
> # ncol()可得到矩阵列数
> # nrow()
> # dim()获得矩阵的行和列
> # length()也可用于矩阵和数组对象的元素个数
> # 将向量组成矩阵,用cbind()或rbind()
> # 如何取得元素值?
> # 如何修改元素值?
> # 如何取得和修改矩阵对象的行名和列名?也可以用dimnames()来获得
> # 在矩阵中可用行名和列名代替索引取得元素值
> # rowSums()
> # colSums()
> # rowMeans()
> # colMeans()
> # t()转置
factor
> # 使用factor()或as.factor()函数建立因子。参数当中,x向量:是要转为因子的向量;levels:原x内元素的可能值
>
> yes.or.no <- c("yes", "no", "no", "yes", "yes")
> first.factor <- factor(yes.or.no)
> first.factor
[1] yes no no yes yes
Levels: no yes
>
>
> yes.or.no <- c("yes", "no", "no", "yes", "yes")
> second.factor <- factor(yes.or.no, levels = c("yes", "no")) # 指定顺序
> second.factor
[1] yes no no yes yes
Levels: yes no
>
>
> # 指定缺失的levels值
> directions <- c("east", "west", "north", "east", "west")
> a.factor <- factor(directions)
> a.factor # 缺少一个level(south), 可补上
[1] east west north east west
Levels: east north west
>
> b.factor <- factor(a.factor, levels = c("east", "west", "south", "north")) #补上了
> b.factor
[1] east west north east west
Levels: east west south north
>
> # 因子带标签
> c.factor <- factor(b.factor, levels = c("east", "west", "south", "north"), labels = c("E", "W", "S", "N"))
> c.factor
[1] E W N E W
Levels: E W S N
>
> # 因子的level参数。用nlevels()可以传回levels的数量,length()是传回因子元素的数量。
>
> # 数值型因子在转换时常见的错误
> temperature <- factor(c(28, 32, 30, 34, 32, 34))
> str(temperature) #level有4个值28/30/32/34,分别对应1/2/3/4.注意下面的情况:
Factor w/ 4 levels "28","30","32",..: 1 3 2 4 3 4
> as.numeric(temperature) # 解决方法如下:
[1] 1 3 2 4 3 4
> as.numeric(as.character(temperature))
[1] 28 32 30 34 32 34
>
> #有序因子
> str1 <- c("A", "B", "A", "C", "D", "B", "D")
> str1.order <- factor(str1, levels = c("D", "C", "B", "A"), ordered = T)
> str1.order
[1] A B A C D B D
Levels: D < C < B < A
>
> #table()统计在因子的所有元素中,levels中各值出现的次数。即level可能取得的值的出现的次数。
> #state.name向量集收集了美国50个州,state.region是因子,记录每个州属于美国哪个区。
> state.region
[1] South West West South West West Northeast South South South West
[12] West North Central North Central North Central North Central South South Northeast South Northeast North Central
[23] North Central South North Central West North Central West Northeast Northeast West Northeast South
[34] North Central North Central South West Northeast Northeast South North Central South South West
[45] Northeast South West South North Central West
Levels: Northeast South North Central West
> table(state.region)
state.region
Northeast South North Central West
9 16 12 13
数据框
> # 数据框
> name <- c("Kevin", "Peter", "Frank", "Maggie")
> gender <- c("M", "M", "M", "F")
> height <- c(170, 175, 165, 168)
> info <- data.frame(name, gender, height)
> info
name gender height
1 Kevin M 170
2 Peter M 175
3 Frank M 165
4 Maggie F 168
>
> #分别用names()和colnames()查询info数据框的列名
> names(info)
[1] "name" "gender" "height"
> colnames(info)
[1] "name" "gender" "height"
>
> #查询行名
> row.names(info)
[1] "1" "2" "3" "4"
>
> #用names()给第一列改名
> names(info)[1] <- "n.name"
> info
n.name gender height
1 Kevin M 170
2 Peter M 175
3 Frank M 165
4 Maggie F 168
>
> # 认识数据框结构
> str(info) #发现字符串变成因子了。这是R 4.0.0之前的默认行为(自4.0.0起默认不再转换)。如不想要,则在data.frame()中设置stringsAsFactors = FALSE
'data.frame': 4 obs. of 3 variables:
$ n.name: Factor w/ 4 levels "Frank","Kevin",..: 2 4 1 3
$ gender: Factor w/ 2 levels "F","M": 2 2 2 1
$ height: num 170 175 165 168
>
> # 取数据框内容
> info[, "n.name"]
[1] Kevin Peter Frank Maggie
Levels: Frank Kevin Maggie Peter
> info[2, ]
n.name gender height
2 Peter M 175
> info$n.name
[1] Kevin Peter Frank Maggie
Levels: Frank Kevin Maggie Peter
> info[, 1] #返回的是向量
[1] Kevin Peter Frank Maggie
Levels: Frank Kevin Maggie Peter
> info[1] #返回的是数据框
n.name
1 Kevin
2 Peter
3 Frank
4 Maggie
>
> # 用$为数据框添加列数据
> weight <- c (65, 71, 58, 55)
> info$weight <- weight
> info
n.name gender height weight
1 Kevin M 170 65
2 Peter M 175 71
3 Frank M 165 58
4 Maggie F 168 55
>
> rm(list = ls())
> name <- c("Kevin", "Peter", "Frank", "Maggie")
> gender <- c("M", "M", "M", "F")
> height <- c(170, 175, 165, 168)
> info <- data.frame(name, gender, height)
> age <- c(19, 20, 20, 19)
> score <- c(88, 91, 75, 80)
> addinfo <- data.frame(age, score)
> addinfo
age score
1 19 88
2 20 91
3 20 75
4 19 80
> newinfo <- cbind(info, addinfo)
> newinfo
name gender height age score
1 Kevin M 170 19 88
2 Peter M 175 20 91
3 Frank M 165 20 75
4 Maggie F 168 19 80
>
> #注意,数据框是由一系列列向量组成的,如果要把矩阵转为数据框,则用到data.frame()函数。
>
list
> # list
>
> x <- c(7, 8, 6, 11, 9, 12, 12, 8, 9, 15, 7, 12)
> colnames <- c("1st", "2nd", "3rd", "4th", "5th", "6th")
> rownames <- c("lin","ge")
> team.cal <- matrix(x, 2, byrow = T, dimnames=list(rownames,colnames))
> baskets.cal <- list("zhang", "2018-12", team.cal) # 注意这里的team.cal不要加引号
> baskets.cal
[[1]]
[1] "zhang"
[[2]]
[1] "2018-12"
[[3]]
1st 2nd 3rd 4th 5th 6th
lin 7 8 6 11 9 12
ge 12 8 9 15 7 12
>
> # 给刚才的list里的对象命名
> n.baskets.cal <- list(teamname = "zhang", season = "2018-12", score.info = team.cal) # 注意这里的team.cal不要加引号
> n.baskets.cal
$`teamname`
[1] "zhang"
$season
[1] "2018-12"
$score.info
1st 2nd 3rd 4th 5th 6th
lin 7 8 6 11 9 12
ge 12 8 9 15 7 12
>
> #names()函数可以获得及修改list里对象的名称
> names(n.baskets.cal)
[1] "teamname" "season" "score.info"
> names(n.baskets.cal)[1] <- "great"
> n.baskets.cal
$`great`
[1] "zhang"
$season
[1] "2018-12"
$score.info
1st 2nd 3rd 4th 5th 6th
lin 7 8 6 11 9 12
ge 12 8 9 15 7 12
>
> # 获得list里的元素
> n.baskets.cal$great
[1] "zhang"
> n.baskets.cal$score.info[2, 4]
[1] 15
> n.baskets.cal[[3]][2, 4]
[1] 15
>
> # list内的对象名可当索引
> n.baskets.cal[["score.info"]]
1st 2nd 3rd 4th 5th 6th
lin 7 8 6 11 9 12
ge 12 8 9 15 7 12
> n.baskets.cal[names(n.baskets.cal) != "great"]
$`season`
[1] "2018-12"
$score.info
1st 2nd 3rd 4th 5th 6th
lin 7 8 6 11 9 12
ge 12 8 9 15 7 12
>
> # 如何修改、添加、删除(赋值为NULL)list里元素的内容?
> # 如何合并list?
>
文本操作
> # 语句分割
> x <- c("Hello R World")
> x
[1] "Hello R World"
> strsplit(x, " ") # 以空格为界拆分。注意返回的是什么格式
[[1]]
[1] "Hello" "R" "World"
>
> # 承上例,拆分后存入向量对象内
> a <- strsplit(x, " ")[[1]]
> a
[1] "Hello" "R" "World"
>
> # toupper() 小写变大写
> # tolower()
>
> # unique() 使向量内容不重复出现
>
> # paste()的collapse参数
> coffee.str <- c("boiling", "coffee", "brings", "out", "a", "bitterly", "taste")
> paste(coffee.str)
[1] "boiling" "coffee" "brings" "out" "a" "bitterly" "taste"
> paste(coffee.str, collapse = " ") # 字符串以空格相连
[1] "boiling coffee brings out a bitterly taste"
>
>
> # paste()主要作用是将两个或多个向量连接
> str_1 <- letters[1:6]
> str_2 <- 1:6
> paste(str_1, str_2)
[1] "a 1" "b 2" "c 3" "d 4" "e 5" "f 6"
> paste(str_1, str_2, sep = "") # 去掉空格
[1] "a1" "b2" "c3" "d4" "e5" "f6"
> paste(str_1, str_2, sep = "", collapse = " ")
[1] "a1 b2 c3 d4 e5 f6"
>
>
> # 使用索引值搜索
> # 列出state.name数据集内每个州名的第2到第4个字符组成的子字符串
> substr(state.name, start = 2, stop = 4)
[1] "lab" "las" "riz" "rka" "ali" "olo" "onn" "ela" "lor" "eor" "awa" "dah" "lli" "ndi" "owa" "ans" "ent" "oui" "ain" "ary" "ass" "ich" "inn" "iss" "iss" "ont" "ebr"
[28] "eva" "ew " "ew " "ew " "ew " "ort" "ort" "hio" "kla" "reg" "enn" "hod" "out" "out" "enn" "exa" "tah" "erm" "irg" "ash" "est" "isc" "yom"
>
> # grep(pattern, x) pattern,搜索目标;x,字符串向量
> grep("M", state.name) # 返回的是索引值
[1] 19 20 21 22 23 24 25 26 31
> state.name[grep("M", state.name)]
[1] "Maine" "Maryland" "Massachusetts" "Michigan" "Minnesota" "Mississippi" "Missouri" "Montana" "New Mexico"
> state.name[grep(" ", state.name)]# 搜索州名中有空格的
[1] "New Hampshire" "New Jersey" "New Mexico" "New York" "North Carolina" "North Dakota" "Rhode Island" "South Carolina" "South Dakota"
[10] "West Virginia"
>
> # 字符串内容更改
> # sub(pattern, replacement, x) 其中replacement用空字符""代替,相当于删除。
>
> state.name[grep("New|South", state.name)] # New|South 不要有空格
[1] "New Hampshire" "New Jersey" "New Mexico" "New York" "South Carolina" "South Dakota"
>
> str_a <- c("ch6.xls", "ch7.xls", "ch7.c", "ch7.doc", "ch8.xls")
> str_b <- c("ch.xls", "ch7.xls", "ch77.xls", "ch87.xls", "ch88.xls")
> str_a[grep("ch(6|7).xls", str_a)]
[1] "ch6.xls" "ch7.xls"
> str_b[grep("ch(7*|8*).xls",str_b)] # 注意,*代表0次或多次;+代表1次或多次
[1] "ch.xls" "ch7.xls" "ch77.xls" "ch88.xls"
> str_b[grep("ch(7+|8+).xls", str_b)]
[1] "ch7.xls" "ch77.xls" "ch88.xls"