getwd() 显示当前的工作目录
setwd("mydirectory") 修改当前的工作目录为mydirectory
dir.create() 创建新目录
ls() 列出当前工作空间中的对象
rm(objectlist) 移除(删除)一个或多个对象
save.image("myfile") 保存工作空间到文件myfile中(默认值为.RData)
save(objectlist, file="myfile") 保存指定对象到一个文件中
load("myfile") 读取一个工作空间到当前会话中(默认值为.RData)
q() 退出R。将会询问你是否保存工作空间
summary
stat.desc
describe
cor(x, use= , method= ) 计算相关系数
#偏相关 偏相关是指在控制一个或多个定量变量时,另外两个定量变量之间的相互关系。你可以使用ggm包中的pcor()函数计算偏相关系数。ggm包没有被默认安装,在第一次使用之前需要先进行安装。函数调用格式为: pcor(u, S) 其中的u是一个数值向量,前两个数值表示要计算相关系数的变量下标,其余的数值为条件变量(即要排除影响的变量)的下标。S为变量的协方差阵。这个示例有助于阐明用法: > library(ggm) > colnames(states) [1] "Population" "Income" "Illiteracy" "Life Exp" "Murder" "HS Grad" > pcor(c(1,5,2,3,6), cov(states)) [1] 0.346 本例中,在控制了收入、文盲率和高中毕业率的影响时,人口和谋杀率之间的相关系数为0.346。偏相关系数常用于社会科学的研究中。
cor.test(x, y, alternative = , method = ) #相关性的显著性检验
函数complete.cases()可以用来识别矩阵或数据框中没有缺失值的行。它返回一个逻辑向量:若某行不含缺失值(即为完整的实例),则对应元素为TRUE;若该行有一个或多个缺失值,则对应元素为FALSE。 以睡眠数据集为例: # 加载数据集 data(sleep, package="VIM") #列出没有缺失值的行 sleep[complete.cases(sleep),] #列出有一个或多个缺失值的行 sleep[!complete.cases(sleep),] 输出结果显示42个实例为完整数据,20个实例含一个或多个缺失值。 由于逻辑值TRUE和FALSE分别等价于数值1和0,可用sum()和mean()函数来获取关于缺失数据的有用信息。如: > sum(is.na(sleep$Dream)) [1] 12 > mean(is.na(sleep$Dream)) [1] 0.19 > mean(!complete.cases(sleep)) [1] 0.32 结果表明变量Dream有12个缺失值,19%的实例在此变量上有缺失值。另外,数据集中32%的实例包含一个或多个缺失值。
# Listing 5-7: a switch() example.
# For each string in `feelings`, switch() returns the value of the argument
# whose name matches that string, and print() shows it.
feelings <- c("sad", "afraid")
for (i in feelings) {
  print(
    switch(i,
      happy  = "I am glad you are happy",
      afraid = "There is nothing to fear",
      sad    = "Cheer up",
      angry  = "Calm down now"
    )
  )
}
# Output:
# [1] "Cheer up"
# [1] "There is nothing to fear"
# (1) aggregate(x, by, FUN): `by` is a list of grouping vectors.
# NOTE(review): these calls need a `trees` data frame that has `type` and
# `age` columns; the built-in datasets::trees only has Girth, Height and
# Volume -- confirm where those columns are added.
aggregate(trees, by = list(trees$type), FUN = mean)

# Equivalent form: with() looks the grouping columns up inside `trees`
# without attach(), which would leave `trees` on the search path.
with(trees, aggregate(trees, by = list(type), FUN = mean))

# Two grouping variables, with different summary functions.
with(trees, aggregate(trees, by = list(type, age), FUN = mean))
with(trees, aggregate(trees, by = list(type, age), FUN = sum))
with(trees, aggregate(trees, by = list(type, age), FUN = max))
with(trees, aggregate(trees, by = list(type, age), FUN = min))
# (2) aggregate(formula, data, FUN): left-hand side = variables to summarise,
# right-hand side = grouping variables.
# NOTE(review): needs a `trees` data frame with `type` and `age` columns,
# which the built-in datasets::trees does not have -- confirm.

# Every remaining column, grouped by type.
aggregate(. ~ type, data = trees, FUN = mean)
# A single column.
aggregate(Girth ~ type, data = trees, FUN = mean)
# Several columns, each summarised separately.
aggregate(cbind(Girth, Height) ~ type, data = trees, FUN = mean)
# `+` on the left-hand side is arithmetic: summarises the sum Girth + Height.
aggregate(Girth + Height ~ type, data = trees, FUN = mean)
# Two grouping variables.
aggregate(Girth ~ type + age, data = trees, FUN = mean)
# Reshaping demo: build a small data frame, melt it to long format, then
# cast it back to various wide layouts.
name <- rep(1:4, each = 3)
year <- rep(1990:1993, 3)
x1 <- sample(1:100, 12)
x2 <- runif(12)
x3 <- runif(12) + 5
mydata <- data.frame(name, year, x1, x2, x3)
mydata

# Wide -> long with reshape2; `id.vars` names the identifier columns, every
# other column is stacked into variable/value pairs.
library(reshape2)
melt(mydata, id.vars = "name")
melt(mydata, id.vars = c("name", "year"))

# Long -> wide with reshape::cast().
# install.packages("reshape")
library(reshape)
# The notes used `md` without defining it. The cast formulas below refer to
# name, year and variable, so melt with both identifier columns.
md <- melt(mydata, id.vars = c("name", "year"))
cast(md, name + year ~ variable)
cast(md, name + variable ~ year)
cast(md, name ~ variable + year)
# When several values fall into one cell, an aggregation function is needed.
cast(md, name ~ variable, mean)
cast(md, name ~ year, sum)
# (1) The key column has the same name (student_ID) in both data frames.
df1 <- data.frame(
  student_ID = c("004", "002", "003", "001"),
  Math = c(98, 77, 90, 87)
)
df2 <- data.frame(
  student_ID = c("003", "005", "001"),
  English = c(89, 92, 80)
)

# Intersection (inner join): only IDs present in both.
merge(df1, df2, by = "student_ID")
# Union (full join): every ID from either side, NA where missing.
merge(df1, df2, by = "student_ID", all = TRUE)
# Left join: every row of df1.
merge(df1, df2, by = "student_ID", all.x = TRUE)
# Right join: every row of df2.
merge(df1, df2, by = "student_ID", all.y = TRUE)
# (2) Two key columns with the same names (student_ID and name) in both
# data frames: rows match only when both keys agree.
df1 <- data.frame(
  student_ID = c("004", "002", "003", "001"),
  name = c("李四", "王二", "张三", "吴一"),
  math = c(98, 77, 90, 87)
)
df2 <- data.frame(
  student_ID = c("003", "005", "001"),
  name = c("张三", "刘五", "吴一"),
  English = c(89, 92, 80)
)
merge(df1, df2, by = c("student_ID", "name"), all = TRUE)
# (3) One key column whose name differs between the two data frames:
# name the key on each side with by.x / by.y.
df1 <- data.frame(
  student_ID_1 = c("004", "002", "003", "001"),
  Math = c(98, 77, 90, 87)
)
df2 <- data.frame(
  student_ID_2 = c("003", "005", "001"),
  English = c(89, 92, 80)
)
# The notes had `al1 = TRUE` (digit one). merge() silently ignores unknown
# arguments, so that performed an inner join instead of the intended full join.
merge(df1, df2, by.x = "student_ID_1", by.y = "student_ID_2", all = TRUE)

# (4) Several key columns with different names: by.x and by.y are matched
# position by position.
df1 <- data.frame(
  student_ID_1 = c("004", "002", "003", "001"),
  name_1 = c("李四", "王二", "张三", "吴一"),
  Math = c(98, 77, 90, 87)
)
df2 <- data.frame(
  student_ID_2 = c("003", "005", "001"),
  name_2 = c("张三", "刘五", "吴一"),
  English = c(89, 92, 80)
)
merge(
  df1, df2,
  by.x = c("student_ID_1", "name_1"),
  by.y = c("student_ID_2", "name_2"),
  all = TRUE
)
# dplyr join verbs on the same two tables.
# Install dplyr only when it is missing, rather than on every run.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

df1 <- data.frame(
  student_ID = c("004", "002", "003", "001"),
  Math = c(98, 77, 90, 87)
)
df2 <- data.frame(
  student_ID = c("003", "005", "001"),
  English = c(89, 92, 80)
)

# Intersection (inner join)
inner_join(df1, df2, by = "student_ID")
# Union (full join)
full_join(df1, df2, by = "student_ID")
# Left join
left_join(df1, df2, by = "student_ID")
# Right join
right_join(df1, df2, by = "student_ID")

# Semi join: keep the rows of x that have a match in y; only x's columns
# are returned.
semi_join(df1, df2, by = "student_ID")
semi_join(df2, df1, by = "student_ID")
# Anti join: keep the rows of x that have NO match in y.
anti_join(df1, df2, by = "student_ID")
anti_join(df2, df1, by = "student_ID")
# Data filtering: extract the subset of rows that satisfies some rule.
#   - which(rule)
#       Returns the matching row numbers; use data[which(rule), ] to extract
#       those rows.
#   - subset(data, rule, columns)
#       `columns` is a vector of the column names to output; if omitted, all
#       columns are returned.
#   - filter(data, rule1, rule2, ...)
#       Multiple rules are combined with logical AND.
class <- c("一班", "二班", "三班", "四班")
name <- c("tom", "jerry", "piggy", "chris")
math <- c(98, 87, 90, 77)
chinese <- c(77, 82, 98, 88)
df <- data.frame(class, name, math, chinese)

# which(): row numbers where the condition holds.
m <- which(df$math >= 90)
which(df$chinese > 70 & df$math > 90)
df[m, ]
which(df$name %in% c("tom", "piggy"))

# subset(): the condition is evaluated inside `df`, so bare column names work.
subset(df, math > 80)
subset(df, math > 80 & chinese > 78)
library(dplyr)
# filter(): keep the rows of `df` that satisfy every condition; conditions
# separated by commas are combined with AND. Columns are referenced by bare
# name, since filter() evaluates the conditions inside the data frame.
filter(df, math > 90)
filter(df, math > 90, class == "一班")
filter(df, math > 90, chinese > 70, class == "一班")
# order(): returns the indices that would sort the vector, not the sorted
# values themselves.
a <- c(1, 3, 56, 2, 78, 34)
order(a)                     # ascending:  1 4 2 6 3 5
order(a, decreasing = TRUE)  # descending: 5 3 6 2 4 1
# sort(): returns the sorted values.
a <- c(1, 3, 56, 2, 78, 34)
sort(a)                     # ascending:  1 2 3 34 56 78
sort(a, decreasing = TRUE)  # descending: 78 56 34 3 2 1
# rank(): the rank of each element; `ties.method` decides how equal values
# (here the two 5s and the two 7s) are ranked.
b <- c(1, 5, 7, 4, 5, 3, 7)
rank(b)                           # default "average": 1 4.5 6.5 3 4.5 2 6.5
rank(b, ties.method = "first")    # ties ranked in order of appearance
rank(b, ties.method = "last")     # ties ranked in reverse order of appearance
rank(b, ties.method = "random")   # ties broken at random; output varies
rank(b, ties.method = "average")  # ties share the mean of their ranks
rank(b, ties.method = "max")      # ties all get the largest of their ranks
rank(b, ties.method = "min")      # ties all get the smallest of their ranks
library(dplyr)
# arrange(): sort the rows of a data frame by a column.
arrange(mtcars, mpg)        # ascending
arrange(mtcars, desc(mpg))  # descending
# nchar(): number of characters in each string.
name <- c("Tom", "jerry", "piggy", "Chris")
nchar(name)  # 3 5 5 5
# toupper(): convert every string to upper case.
name <- c("Tom", "jerry", "piggy", "Chris")
toupper(name)  # "TOM" "JERRY" "PIGGY" "CHRIS"
# tolower(): convert every string to lower case.
name <- c("Tom", "jerry", "piggy", "Chris")
tolower(name)  # "tom" "jerry" "piggy" "chris"
# paste(): concatenate vectors element-wise; `sep` is placed between pieces.
paste("M", 1:6)            # default sep = " ": "M 1" "M 2" ... "M 6"
# sep = "" is kept to demonstrate the argument; it gives the same result as
# paste0("M", 1:6).
paste("M", 1:6, sep = "")  # "M1" "M2" ... "M6"
# paste0(): paste() with no separator between the pieces.
paste0("m", 1:5)                  # "m1" "m2" "m3" "m4" "m5"
# `collapse` joins the resulting vector into one string.
paste0("m", 1:5, collapse = "-")  # "m1-m2-m3-m4-m5"
# substr(x, start, stop): extract characters start..stop (1-based, inclusive).
substr("今天运气真好啊!", 3, 6)
# Vectorised over x: the same positions are taken from every string.
substr(c("I love you!", "今天运气真好啊!"), 3, 6)

# The replacement form overwrites characters start..stop in place.
b <- "今天运气真好啊!"
substr(b, 3, 4) <- "天气"
b
# strsplit(x, split): split each string at `split`; returns a list with one
# character vector per input string. fixed = TRUE treats "." as a literal
# dot rather than the regex "any character".
strsplit("今天.运气.真好.啊", ".", fixed = TRUE)
strsplit(c("I.love.you!", "今天.运气.真好.啊"), ".", fixed = TRUE)
# An empty split string breaks the input into single characters.
strsplit("adcdefg", "", fixed = TRUE)

b <- strsplit(c("I.love.you!", "今天.运气.真好.啊"), ".", fixed = TRUE)
b
# unlist() flattens the list into one character vector.
unlist(b)
# grep(pattern, x): indices of the elements of x that contain the pattern.
data <- c("abc2", "ghty", "c2ef")
grep("c2", data)  # 1 3