http://blog.sina.com.cn/s/blog_6bc5205e0102vma9.html
install.packages
1、从网络上直接安装
install.packages("包名字", lib = "安装目录", repos = "包所在的网址")。也可通过参数 contriburl 指定包所在的网址
例:
install.packages("stepNorm", contriburl = "http://www.your.url", dependencies = TRUE)
2、本地安装
1)install.packages("包文件的完整路径", repos = NULL)
2)在命令行下(不是R窗口)直接输入:R CMD INSTALL 包文件的完整路径
installed.packages()# 查看已经安装的包 配合install.packages() 写一个循环可以批量安装包
update.packages()#更新已安装的包
path.package() #查看当前已加载的包所在的路径
.libPaths()#查看包的安装目录
.libPaths('yourlibrary')#修改包的安装目录
.libPaths(c('newlibrary','oldlibrary'))
library("your package", lib.loc="/yourlibrary/")#下载到临时文件夹的调用方法
R.version 、R.version.string 查看当前版本
R帮助函数
help.start( )打开帮助文档首页
help("函数名") 查看指定函数的帮助文档
?函数名 是 help("函数名") 的简写
# Read only the first five rows of a whitespace-delimited table.
data <- read.table("datatable.txt", nrows = 5)
getwd()
# NOTE(review): machine-specific working directory; the relative CSV paths
# below rely on it.
setwd('D:\\RStudio\\www')
data2 <- read.csv('new.csv', encoding = 'UTF-8')
# Show the first column.
data2[, 1]
# Turn the first column into a factor (categorical variable).
data2[, 1] <- factor(data2[, 1])
data2[, 1]
# Re-label the factor levels with display names.
data2[, 1] <- factor(data2[, 1], labels = c('三年一班', '三年二班', '三年三班'))
data2[, 1]
# The documented values for `encoding` are "UTF-8" and "latin1"; the previous
# 'utf8' spelling is not one of them.
data_1 <- read.csv('new_1.csv', encoding = 'UTF-8')
fix(data_1)
data_1[, 1] <- factor(
  data_1[, 1],
  levels = c(1, 2, 3),
  labels = c('三年一班', '三年二班', '三年三班')
)
data_1
# The factor as a character vector (its level labels).
as.vector(data_1[, 1])
data_1
# The factor as a numeric vector (its internal level codes).
as.numeric(data_1[, 1])
# Discretise a continuous variable into three equal-width intervals.
score <- data_1[, 3]
score1 <- cut(score, breaks = 3)
score1
# Cut at explicitly chosen break points.
score2 <- cut(score, breaks = c(79, 100, 120, 161))
score2
# An ordered factor built from the three intervals.
score3 <- ordered(score1, labels = c('bad', 'ok', 'good'))
score3
table(score3)
# An ordered categorical column with an explicit level order.
datax <- read.csv('new.csv', encoding = 'UTF-8')
# `levels` and `labels` are passed as named arguments with `=`. Using `<-`
# inside the call would create global variables and depend on positional
# matching instead.
datax[, 1] <- ordered(
  datax[, 1],
  levels = c(1, 3, 2),
  labels = c('一班', '三班', '二班')
)
table(datax[, 1])
# Sorting by an ordered factor follows the level order, not the raw values.
datax[order(datax[, 1]), ]
datax[order(datax[, 3]), ]
# ---- Creating lists ----
# Without tags (unnamed elements).
j <- list('a', 500, TRUE)
# With tags (named elements).
y <- list(name = 'fudegang', salary = 10000, union = TRUE)
# ---- Accessing list elements ----
# 1. list_name$tag_name
y$name
y$salary
# 2. list_name[[tag]]
y[['name']]
# 3. list_name[[index]]
y[[1]]
# Empty single-bracket indexing returns the whole list.
y[]
y[1]$name
# Inspect the tags.
labels(y)
labels(y[1])
# `[[` extracts exactly one element; `[` with several indices returns a
# sub-list. (Indexing the function `list` itself, as in `list[1:2]`, is an
# error.)
y[1:2]
# ---- Modifying a list ----
# Add elements, by tag and by position.
y$sex <- '男'
y[5] <- 170
y[3]
# Change an element.
y$sex <- '女'
y
# Delete an element by assigning NULL.
y$sex <- NULL
y
# Search: element-wise comparison against a value.
y == 'fudegang'
# Number of elements.
length(y)
# ---- Data frames ----
# Build a data frame from two equal-length vectors.
name <- c("张三", "李四", "王五")
age <- c(23, 33, 56)
df <- data.frame(name, age)
df
# Column names: inspect them, then replace all of them.
colnames(df)
names(df)
names(df) <- c("name2", "age2")
colnames(df)
# Rename a single column by position.
names(df)[2] <- "age3"
df
# Rename a single column by its current name.
names(df)[names(df) == "age3"] <- "age4"
df
# Row names: inspect them, then replace them.
row.names(df)
row.names(df) <- 0:2
df
# Drop a row with a negative index.
df1 <- df[-1, ]
df1
df2 <- df[-2, ]
df2
# Add a column named `sex`.
df$sex <- c(0, 1, 1)
df
# ---- 3.1 Data cleaning ----
# Duplicates: unique() keeps one copy of each repeated row.
dd <- read.csv('1.csv', encoding = 'UTF-8')
dd
new_dd <- unique(dd)
new_dd
# Missing values can be filled in, dropped, or left alone.
# Here: drop every row that contains an NA with na.omit().
dd_1 <- read.csv('2.csv', encoding = 'UTF-8')
dd_1
new_dd_1 <- na.omit(dd_1)
new_dd_1
# Strip leading/trailing whitespace with trim() from the raster package.
# Install the package only when it is missing, and into the default library:
# `.libPaths()` can return several directories, so it is not passed as `lib`.
.libPaths()
if (!requireNamespace('raster', quietly = TRUE)) {
  install.packages('raster')
}
library('raster')
path.package()
getwd()
# NOTE(review): machine-specific working directory; '3.csv' is resolved
# against it.
setwd('D:\\RStudio\\www')
dd_3 <- read.csv('3.csv')
dd_3
new_dd_3 <- trim(dd_3)
View(new_dd_3)
# ---- 3.2 Data extraction ----
# Field extraction with substr(x, start, stop): split a phone number into
# its first 3, middle 4 and last 4 characters.
tel <- "13811568128"
band <- substr(tel, start = 1, stop = 3)
band
area <- substr(tel, start = 4, stop = 7)
area
num <- substr(tel, start = 8, stop = 11)
num
getwd()
# Apply the same three substr() slices to the first column of a CSV file.
tels <- read.csv("1.csv")
fix(tels)
bands <- substr(tels[, 1], start = 1, stop = 3)
bands
areas <- substr(tels[, 1], start = 4, stop = 7)
areas
nums <- substr(tels[, 1], start = 8, stop = 11)
nums
# Put the original data and the three extracted pieces side by side.
num_tels <- data.frame(tels, bands, areas, nums)
fix(num_tels)
# Field splitting with str_split_fixed(x, split, n) -- similar to Excel's
# "text to columns".
.libPaths()
# Install stringr only when it is missing, and into the default library:
# `.libPaths()` can return several directories, so it is not passed as `lib`.
if (!requireNamespace('stringr', quietly = TRUE)) {
  install.packages('stringr')
}
library(stringr)
items <- read.csv('2.csv', encoding = 'UTF-8', stringsAsFactors = FALSE)
fix(items)
# Split the first column at the first space into exactly two pieces.
new_bands <- str_split_fixed(items[, 1], ' ', n = 2)
new_bands
fix(new_bands)
# Keep the first piece next to the original data.
new_items <- data.frame(new_bands[, 1], items)
fix(new_items)
names(new_items) <- c('band', 'item')
colnames(new_items)
fix(new_items)
# Row filtering with subset(x, condition) -- similar to Excel's filter.
getwd()
item3 <- read.table(
  '3.csv',
  header = TRUE,
  sep = "|",
  fileEncoding = 'utf-8',
  stringsAsFactors = FALSE
)
fix(item3)
sub_item3 <- subset(item3, comments > 100)
fix(sub_item3)
# ---- 3.3 Combining data ----
# Stack records with rbind(dataframe1, dataframe2, ...).
data1 <- read.table(
  '1_1.csv', sep = "|", header = TRUE,
  fileEncoding = 'utf-8', stringsAsFactors = FALSE
)
data2 <- read.table(
  '1_2.csv', sep = '|', header = TRUE,
  fileEncoding = 'utf-8', stringsAsFactors = FALSE
)
data3 <- read.table(
  '1_3.csv', sep = '|', header = TRUE,
  fileEncoding = 'utf-8', stringsAsFactors = FALSE
)
datar <- rbind(data1, data2, data3)
fix(datar)
# Keep the rows whose second column exceeds 10000.
datab <- subset(datar, datar[, 2] > 10000)
fix(datab)
# Concatenate fields; paste0() is paste() with sep = ''.
data4 <- read.table('2.csv', sep = ' ')
fix(data4)
data5 <- paste0(data4[, 1], data4[, 2], data4[, 3])
new_data5 <- data.frame(data4, data5)
fix(new_data5)
# Field matching with merge(x, y, by.x, by.y) -- similar to Excel's VLOOKUP.
# header = FALSE, so the columns get the default names V1, V2, ...
items6 <- read.table('3_1.csv', sep = '|', header = FALSE, fileEncoding = 'utf-8')
fix(items6)
price <- read.table('3_2.csv', sep = '|', header = FALSE, fileEncoding = 'utf-8')
colnames(items6)
# Join on the first column of `items6` and column V1 of `price`. An earlier
# merge(price, items6, ...) assigned to the same variable was immediately
# overwritten by this one, so only this call is kept.
itmesprice <- merge(items6, price, by.x = names(items6)[1], by.y = 'V1')
fix(itmesprice)
# ---- 3.4 Simple calculations ----
getwd()
dada <- read.csv(
  '1.csv',
  fileEncoding = 'utf-8',
  header = TRUE,
  stringsAsFactors = FALSE,
  sep = '|'
)
colnames(dada)
# Derived column: cost = price * num.
cost <- dada$price * dada$num
new_dada <- data.frame(dada, cost)
fix(new_dada)
# Standardisation, here min-max (0-1) scaling of `score`.
dada2 <- read.csv('2.csv', fileEncoding = 'utf-8')
View(dada2)
colnames(dada2)
# NOTE(review): the variable is called `scale` (same name as base::scale)
# because it becomes the column name in `new_dada2`. If all scores are equal
# the denominator is 0 and the result is NaN.
scale <- (dada2$score - min(dada2$score)) / (max(dada2$score) - min(dada2$score))
new_dada2 <- data.frame(dada2, scale)
fix(new_dada2)
# Grouping a numeric column into bands.
cc <- read.csv('3.csv', header = TRUE, sep = '|', fileEncoding = 'utf-8')
options(digits = 15)
cc
fix(cc)
# cut() with right-closed intervals reproduces the former nested ifelse()
# chain: <= 20, <= 40, <= 60, <= 80, and everything above. The labels are
# kept exactly as before; as.character() keeps `level` a character vector.
level <- as.character(cut(
  cc$cost,
  breaks = c(-Inf, 20, 40, 60, 80, Inf),
  labels = c('(0,20)', '(20,40)', '(40,60)', '(60,80)', '(80-以上'),
  include.lowest = TRUE
))
level
cc1 <- data.frame(cc, level)
fix(cc1)
# ---- 3.5 Date handling ----
# Parse date strings into POSIXlt; `format` describes how the input is laid out.
strdata <- "2016-4-28"
posixlt <- as.POSIXlt(strdata, format = "%Y-%m-%d")
posixlt
strdata2 <- "2016/4/29"
posixlt <- as.POSIXlt(strdata2, format = "%Y/%m/%d")
posixlt
# Format a date back into a string.
newstrdata <- format(posixlt, format = "%Y-%m-%d")
newstrdata
# Extracting date parts.
xxx <- read.csv('1.csv', header = TRUE)
fix(xxx)
# Parse the registration-time column into POSIXlt.
pos <- as.POSIXlt(xxx$注册时间, format = '%Y-%m-%d')
# The statements below refer to the parsed dates as `www`, which was never
# assigned before; bind it to the parsed result.
www <- pos
fix(www)
# POSIXlt stores years since 1900 and months as 0-11.
yeas <- www$year + 1900
mon <- www$mon + 1
newwww <- data.frame(www, yeas, mon)
View(newwww)
# ---- 4.1 Data analysis ----
# Basic statistics: summary(), length(), sum(), mean(), var(), sd().
getwd()
.libPaths()
# NOTE(review): machine-specific working directory; '1.csv' is resolved
# against it.
setwd('D:\\RStudio\\www')
getwd()
ali <- read.csv('1.csv', fileEncoding = 'utf-8')
ali
summary(ali$score)
# Count
length(ali$score)
# Mean
mean(ali$score)
# Maximum
max(ali$score)
# Minimum
min(ali$score)
# Variance
var(ali$score)
# Standard deviation
sd(ali$score)
# Sum
sum(ali$score)
# ---- 4.2 Grouped analysis with aggregate(statistic ~ group) ----
# Similar to an Excel pivot table. With `data = ali`, the formula uses bare
# column names, so the result columns are called `class`/`name`/`score`
# rather than `ali$class` etc.
aggregate(name ~ class, data = ali, FUN = length)
aggregate(score ~ class, data = ali, FUN = sum)
aggregate(score ~ class, data = ali, FUN = mean)
colnames(ali)
# ---- 4.3 Cross tabulation ----
# tapply(statistic, list(row factor, column factor), FUN = summary function),
# similar to an Excel pivot table.
用户明细 <- read.csv('用户明细.csv', stringsAsFactors = FALSE)
fix(用户明细)
# Age bands. Both tests read the age column of `用户明细` (the first one
# previously referred to an undefined `user`). The second test is `< 30` so
# that age 30 lands in '30岁及以上', matching the labels.
年龄分组 <- ifelse(
  用户明细$年龄 <= 20, '20岁及20岁以下',
  ifelse(用户明细$年龄 < 30, '21岁至29岁', '30岁及以上')
)
colnames(用户明细)
fix(年龄分组)
用户明细 <- data.frame(用户明细, 年龄分组)
fix(用户明细)
# Count user IDs per age band (rows) and gender (columns).
tapply(
  用户明细$用户ID,
  list(用户明细$年龄分组, 用户明细$性别),
  FUN = length
)
# ---- Structure analysis with prop.table() ----
getwd()
setwd("D:\\RStudio\\www")
bibi <- read.csv("5.csv", stringsAsFactors = FALSE, fileEncoding = "utf-8")
fix(bibi)
colnames(bibi)
# Row counts per brand, then each brand's share of the total.
bibi1 <- tapply(bibi$月消费.元., list(bibi$通信品牌), FUN = length)
bibi1
prop.table(bibi1)
# Row counts per province (rows) and brand (columns).
bibi1 <- tapply(bibi$月消费.元., list(bibi$省份, bibi$通信品牌), FUN = length)
bibi1
# margin = 1 gives proportions within each row.
prop.table(bibi1, margin = 1)
# ---- 5.1 Data visualisation ----
# Pie chart with pie().
bibi1 <- tapply(bibi$月消费.元., list(bibi$通信品牌), length)
bibi1
p <- prop.table(bibi1)
# Slice labels of the form "<brand><percent>%"; paste0() is paste() with
# sep = ''.
label <- paste0(names(p), round(p * 100, 2), '%')
# The argument is spelled out as `labels`; `label =` only worked through
# partial argument matching.
pie(bibi1, labels = label, main = '通信品牌用户结构图')
# Scatter plot: plot(x, y, main, sub, xlab, ylab, col).
datam <- read.csv("data.csv", header = TRUE)
colnames(datam)
plot(
  x = datam$广告费用,
  y = datam$购买用户数,
  main = "相关分析",
  sub = "广告费用和用户数之间的关系",
  xlab = "广告费用",
  ylab = "购买用户数",
  col = "red"
)
# Line chart: plot(x, y, main, sub, xlab, ylab, col, type).
ds <- as.POSIXlt(datam[, 1])
# POSIXlt stores years since 1900 and months as 0-11.
year <- ds$year + 1900
moth <- ds$mon + 1
yearm <- paste0(year, '年', moth, '月')
# plot() cannot use the character labels in `yearm` as x coordinates, so the
# points are drawn at positions 1..n and the labels go on a custom axis.
# type = 'l' draws a line; xaxt = 'n' suppresses the default x axis.
# NOTE(review): `main` and `sub` are the same as on the scatter plot --
# confirm they are the intended titles for this chart.
plot(
  seq_along(yearm),
  datam$购买用户数,
  type = 'l',
  xaxt = 'n',
  main = '相关分析',
  sub = '广告费用和用户数之间的关系',
  xlab = '月份',
  ylab = '购买用户数',
  col = 'red'
)
axis(1, at = seq_along(yearm), labels = yearm)
# Map drawing: map(database, fill = FALSE, col).
# Map annotation: text(x, y, text, cex), where cex scales the font size.
# NOTE(review): machine-specific library directory, added to the library
# search path so that packages are installed into and loaded from it.
.libPaths("d:/R/R-3.2.3/library")
.libPaths()
# Install each package only when it is missing, and before loading it.
# Package names must be quoted strings: install.packages(maps) would look for
# an object called `maps`.
if (!requireNamespace('maps', quietly = TRUE)) {
  install.packages('maps')
}
if (!requireNamespace('mapdata', quietly = TRUE)) {
  install.packages('mapdata')
}
installed.packages()
library(maps)
library(mapdata)
path.package()
m <- map('state', fill = FALSE)
m$names
# NOTE(review): this variable is named `c`, like base::c(); calls to c()
# still resolve to the function.
c <- map('china', fill = FALSE)
# ---- Prepare the map data ----
# Install each package only when it is missing.
if (!requireNamespace('maptools', quietly = TRUE)) {
  install.packages('maptools')
}
library(maps)
library(mapdata)
library(maptools)
# Read the map's spatial data from a shapefile.
# NOTE(review): readShapePoly() is deprecated in maptools -- confirm it is
# available in the installed version.
china_map <- readShapePoly('bou2_4p.shp')
plot(china_map)
if (!requireNamespace('ggplot2', quietly = TRUE)) {
  install.packages('ggplot2')
}
library(ggplot2)
# Draw the outline with ggplot; coord_map() needs the mapproj package.
if (!requireNamespace('mapproj', quietly = TRUE)) {
  install.packages('mapproj')
}
ggplot(china_map, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "white", colour = "grey") +
  coord_map("polyconic")
# Administrative attributes, one row per shape.
x <- china_map@data
# Zero-based shape ids derived from the actual number of rows (925 shapes,
# islands included, in the original data) instead of a hard-coded 0:924.
xs <- data.frame(x, id = seq_len(nrow(x)) - 1)
# Convert the spatial object into a data frame.
china_map1 <- fortify(china_map)
library(plyr)
# Merge the two data frames; plyr reports "Joining by: id".
china_map_data <- join(china_map1, xs, type = "full")
# ---- Prepare the business data ----
NAME <- unique(china_map@data$NAME)
# Province-level data.
mydata1 <- read.csv('www.csv')
mydata <- data.frame(NAME, mydata1)
# One random value per row (34 rows in the original data) instead of a
# hard-coded count.
ccc <- runif(nrow(mydata), min = 1, max = 100)
mydata <- data.frame(mydata, ccc)
# Merge the two data frames; plyr reports "Joining by: NAME".
china_data <- join(china_map_data, mydata, type = "full")
# ---- Draw the maps ----
# Choropleth: fill each polygon according to `ccc`.
ggplot(china_data, aes(x = long, y = lat, group = group, fill = ccc)) +
  geom_polygon(colour = "grey40") +
  # Gradient fill colours; RGB values may be used as well.
  scale_fill_gradient(low = "white", high = "steelblue") +
  # Polyconic projection, which gives the familiar view of China.
  coord_map("polyconic")
# Same map, with theme() removing the elements that are not needed.
ggplot(china_data, aes(x = long, y = lat, group = group, fill = ccc)) +
  geom_polygon(colour = "grey40") +
  scale_fill_gradient(low = "white", high = "steelblue") +
  coord_map("polyconic") +
  theme(
    panel.grid = element_blank(),
    panel.background = element_blank(),
    axis.text = element_blank(),
    # Previously broken across two lines ("elemen" / "t_blank()"), which did
    # not parse.
    axis.ticks = element_blank(),
    axis.title = element_blank(),
    legend.position = c(0.2, 0.3)
  )
#导出文件 write.table(x,file = '',sep = '',row.names = T,col.names = T,quote = T(字符串是否用引号括起来))
#seq(from,to,by,length.out = 最大长度)生成任意步长的数列
#rep(x,times) 生成任意次数的重复向量
用'demo()'来看一些示范程序,用'help()'来阅读在线帮助文件,或
用'help.start()'通过HTML浏览器来看帮助文件。
用'q()'退出R.
变量 命令 参数设置工作空间
> x <- 10;
> y <- x/7;
> y
[1] 1.428571
> options(digits=10)
> y
[1] 1.428571429
> options(digits=20)
> y
[1] 1.4285714285714286
向量 列表框
vector frame
vector 定义: c() 限制:所有元素的数据类型要一样 访问:f[]
frame 定义: data.frame() 限制:同一列的数据类型要一样 访问:f[]
fix() 可视化列表框
read.csv("first.csv")
read.table("first.txt",header = TRUE ,sep = "\t" )
read.excel()
read.excel2007()
RODBC
odbcConnectExcel()
install.packages("RODBC")
library(RODBC)
s = odbcConnectExcel("first.xls")
sqlFetch(s,"sheet1")
无法安装rodbc
可以试下执行:Sys.setlocale(category = "LC_ALL", locale = "us")
win+r 运行lusrmgr.msc 修改用户名
数据的导出:
数据清洗
去重
bbc <- read.csv('1.csv',encoding = 'UTF-8')
bbc <- unique(bbc)
na.omit()
去掉空值
str_split_fixed(x,split,n)
安装包,指定安装包的路径
卸载安装包
>