【R语言】R语言备忘录

2024-04-22

一些常见的R语言命令,主要是基于Baser,后面我会继续整理基于R语言dplyr包处理数据的技巧方法

0.目录

Table of contents generated with markdown-toc

1.Load R packages

library(ggplot2)
library(tidyr)
library(RColorBrewer) 
library(dplyr)
library(ggstatsplot)

2.set path & import data

rm(list=ls())
setwd("D:/project/Feb2023_Lumpy/data")
dir()
load("project_data3.RData")
head(data)
data <- as.data.frame(data)
data$lnP <- log(1+data$P)
data$lnQ <- log(1+data$Q)
data$lnV <- log(1+data$V)

image

3.Identify missing values

data$lnpif<-is.na(data$lnP)
data$lnqif<-is.na(data$lnQ)
data$lnvif<-is.na(data$lnV)
data$ifNA <- complete.cases(data)
data_heal <- data[data$lnqif==FALSE&data$lnvif==FALSE&data$lnpif==FALSE, c("id","hs8","Year","capt","T_hs08") ]

4.conditional assignment

data_heal$type[data_heal$capt == 1 & data_heal$Year >= 2007 ] <- "hg2" 
data_heal$type[data_heal$capt == 1 & data_heal$Year < 2007 ] <- "hg3" 
data_heal$type[data_heal$capt != 1 & data_heal$Year >= 2007 ] <- "hg1" 
data_heal$type[data_heal$capt != 1 & data_heal$Year < 2007 ] <- "hg4" 
table(data_heal$type,data_heal$Year)

image

5.tapply (grouped calculation)

data_heal_ <- data[is.na(data_heal$T_hs08) == FALSE & is.nan(data_heal$T_hs08) == FALSE , c("id","hs8","Year","capt","T_hs08") ]
data_heal_$group_check <- as.double(runif(dim(data_heal_)[1]))
data_heal_$group[data_heal_$group_check <= 0.25] <- "1"
data_heal_$group[data_heal_$group_check > 0.25 & data_heal_$group_check <= 0.5] <- "2"
data_heal_$group[data_heal_$group_check >0.5& data_heal_$group_check <= 0.75] <- "3"
data_heal_$group[data_heal_$group_check >0.75] <- "4"
data_heal_$lnT <- log(1+ data_heal_$T_hs08)
tapply(data_heal_$lnT,data_heal_$group,"mean",na.rm = TRUE) 
mean(data_heal_$lnT[data_heal_$group == "2"],na.rm = TRUE)
tapply(data_heal_$lnT,list(data_heal_$group,data_heal_$Year),"mean",na.rm = TRUE) 

# manager <- c(1, 2, 3, 4, 5)
# country <- c("US", "US", "UK", "UK", "UK")
# gender <- c("M", "F", "F", "M", "F")
# age <- c(32, 45, 25, 39, 99)
# leadership <- data.frame(manager, country, gender, age)
# tapply(leadership$age, leadership$country, mean)	# 求在不同country水平下的age的均值

image image

image

6.quantile and summary

summary(data,maxsum = 20,  quantile.type = 7)
quantile(data$lnP,seq(from = 0 ,to = 1 , by = 0.1),na.rm = TRUE)

image

7.origin-data (coefplot in stata)

fitmod_T <- estimatr::lm_robust(T_hs08 ~ as.factor(Year) , data = data_heal[data_heal$capt !=0, ], se_type= "stata") 
summary(fitmod_T)  
ggcoefstats(fitmod_T,exclude.intercept = TRUE,stats.labels = FALSE)

image image

8.dataclear in hs8-year level (average&reg)

# average for T_hs08
data_heal2 <- aggregate(data_heal$T_hs08,list(data_heal$hs8,data_heal$Year),mean)

# rename
names(data_heal2) <- c("hs8","year","meanT")

# Extract non-missing values
data_heal2 <- data_heal2[is.na(data_heal2$meanT) != TRUE, c("hs8","year","meanT")]

# Logarithmic
data_heal2$logT <- log(1+data_heal2$meanT)

# regress
fitmod_T2 <- estimatr::lm_robust(logT ~ as.factor(year),data = data_heal2, se_type = "stata")
summary(fitmod_T2)  

# plot the coef
ggcoefstats(fitmod_T2,exclude.intercept = TRUE,stats.labels = FALSE,vline = TRUE)
boxplot(data_heal2$logT ~ as.factor(data_heal2$year))

image image

image

9.dataclear in hs8-year level


data0 <- data[data$capt == 0 ,]
data0_P <- data.frame(aggregate(data0$lnP,list(data0$Year,data0$hs8),mean))
data0_Q <- data.frame(aggregate(data0$lnQ,list(data0$Year,data0$hs8),mean))
data0_V <- data.frame(aggregate(data0$lnV,list(data0$Year,data0$hs8),mean))
names(data0_P) <-c("year","hs","meanP")
names(data0_Q) <-c("year","hs","meanQ")
names(data0_V) <-c("year","hs","meanV")
data0_ <- merge(data0_P,data0_Q,by.x = c("year","hs"),by.y = c("year","hs"))
data0_ <- merge(data0_,data0_V,by.x = c("year","hs"),by.y = c("year","hs"))

data1 <- subset(data,data$capt == 1 )
data1_P <- data.frame(aggregate(data1$lnP,list(data1$Year,data1$hs8),mean))
data1_Q <- data.frame(aggregate(data1$lnQ,list(data1$Year,data1$hs8),mean))
data1_V <- data.frame(aggregate(data1$lnV,list(data1$Year,data1$hs8),mean))
names(data1_P) <-c("year","hs","meanP")
names(data1_Q) <-c("year","hs","meanQ")
names(data1_V) <-c("year","hs","meanV")
data1_ <- merge(data1_P,data1_Q,by.x = c("year","hs"),by.y = c("year","hs"))
data1_ <- merge(data1_,data1_V,by.x = c("year","hs"),by.y = c("year","hs"))

data2 <- subset(data,data$capt == 2 )
data2_P <- data.frame(aggregate(data2$lnP,list(data2$Year,data2$hs8),mean))
data2_Q <- data.frame(aggregate(data2$lnQ,list(data2$Year,data2$hs8),mean))
data2_V <- data.frame(aggregate(data2$lnV,list(data2$Year,data2$hs8),mean))
names(data2_P) <-c("year","hs","meanP")
names(data2_Q) <-c("year","hs","meanQ")
names(data2_V) <-c("year","hs","meanV")
data2_ <- merge(data2_P,data2_Q,by.x = c("year","hs"),by.y = c("year","hs"))
data2_ <- merge(data2_,data2_V,by.x = c("year","hs"),by.y = c("year","hs"))

data0_$BEC <- 0
data1_$BEC <- 1
data2_$BEC <- 2
data_ <- rbind(data0_,data1_,data2_)

rm(data0_P,data0_Q,data0_V,data1_P,data1_Q,data1_V,data2_P,data2_Q,data2_V,data0,data1,data2)


data_$lnmeanQ <- log(data_$meanQ)
data_$lnmeanP <- log(data_$meanP)
data_$lnmeanV <- log(data_$meanV)

10.scatter

ggplot(data_, aes(lnmeanP, lnmeanV, color = BEC))+
  geom_point(size = 2.0, shape = 16)

data_cha <- data.frame(subset((data_),data_$BEC != 0))
ggplot(data_cha, aes(lnmeanP, lnmeanV, color = BEC))+
  geom_point(size = 2.0, shape = 16)

image

image

11.density

11.1density-type1
p <- ggplot(data = data_, mapping = aes(
  x = lnmeanV, y = year))
p + geom_jitter(width = 0, height = 0.3, alpha = 0.5)

image

11.2density-type2
11.2.1 no-shade
p <- ggplot(data = data_, mapping = aes(
  x = lnmeanV, color = year))
p + geom_line(stat = "density")


data_$year_str <- as.character(data_$year)

p <- ggplot(data = data_, mapping = aes(
  x = lnmeanV, color = year_str))
p + geom_line(stat = "density")

image image

11.2.2 shade
p <- ggplot(data = data_, mapping = aes(
  x = lnmeanV, color = year_str, fill = year_str))
p + geom_density(alpha = 0.4)

image

12.summarize the data

summary(data_,maxsum = 20,  quantile.type = 7)

image

13.regress

n <- 10
x <- rnorm(n,1)
y <- rnorm(n,1)
fit_try <- lm(y ~ x)
summary(fit_try)


data_$ifNA <- complete.cases(data_)
data_reg <- as.data.frame(subset(data_,data_$ifNA == TRUE))

# fit <- lm(lnmeanV ~ lnmeanP, data =data_)
# summary(fit)   

fit2 <- lm(lnmeanV ~ lnmeanP, data =data_[data_$BEC==1, ])     
summary(fit2)   

fit3 <- lm(lnmeanV ~ lnmeanP, data =data_[data_$BEC !=0, ])     
summary(fit3)   

fit4 <- estimatr::lm_robust(lnmeanV ~ lnmeanP , data = data_[data_$BEC !=0, ], se_type= "stata") 
summary(fit4)   

fit5 <- estimatr::lm_robust(lnmeanV ~ lnmeanP , data = data_[data_$BEC !=0, ], clusters = hs) 
summary(fit5)   

image image image image image

14.reg R versus stata

# 简单回归
# reg lwage i.numdep          
# reg lwage c.educ#c.exper    
# reg lwage c.educ##c.exper   
# reg lwage c.exper##i.numdep 
# 
# lm(lwage ~ as.factor(numdep), data= wage1)         
# lm(lwage ~ educ:exper, data =wage1)                 
# lm(lwage ~ educ*exper, data =wage1)                 
# lm(wage ~ exper*as.factor(numdep),data = wage1) 

fit6 <- estimatr::lm_robust(lnmeanV ~ as.factor(year) , data = data_[data_$BEC !=0, ], se_type= "stata") 
summary(fit6)  

ggcoefstats(fit6,exclude.intercept = TRUE,stats.labels = FALSE)

# coefplot
# p <- ggplot(data = ddsmall_summ, mapping = aes(
#   x = cut,
#   y = mean_carat ))
# p + geom_point(size = 1.2) +
#   geom_pointrange(mapping = aes(
#     ymin = mean_carat - sd_carat,
#     ymax = mean_carat + sd_carat)) +
#   labs(x = "Cut",
#        y = "Carat")



#### 面板回归

# xtset id year
# xtdescribe
# xtsum
# xtreg mrdrte unem, fe
# 
# plm::is.pbalanced(murder$id,murder$year)
# modfe <- plm::plm(mrdrte ~ unem,index = c("id", "year"),model ="within", data = murder)
# summary(modfe)

15.dataclear for bar & line


Data0_P <- data.frame(aggregate(data0_$meanP,list(data0_$hs),mean))
Data0_Q <- data.frame(aggregate(data0_$meanQ,list(data0_$hs),mean))
Data0_V <- data.frame(aggregate(data0_$meanV,list(data0_$hs),mean))

Data1_P <- data.frame(aggregate(data1_$meanP,list(data1_$hs),mean))
Data1_Q <- data.frame(aggregate(data1_$meanQ,list(data1_$hs),mean))
Data1_V <- data.frame(aggregate(data1_$meanV,list(data1_$hs),mean))

Data2_P <- data.frame(aggregate(data2_$meanP,list(data2_$hs),mean))
Data2_Q <- data.frame(aggregate(data2_$meanQ,list(data2_$hs),mean))
Data2_V <- data.frame(aggregate(data2_$meanV,list(data2_$hs),mean))

names(Data0_P) <- c("hs","meanP")
names(Data0_Q) <- c("hs","meanQ")
names(Data0_V) <- c("hs","meanV")
names(Data1_P) <- c("hs","meanP")
names(Data1_Q) <- c("hs","meanQ")
names(Data1_V) <- c("hs","meanV")
names(Data2_P) <- c("hs","meanP")
names(Data2_Q) <- c("hs","meanQ")
names(Data2_V) <- c("hs","meanV")

Data0_ <- merge(Data0_P,Data0_Q,by.x = c("hs") ,by.y = c("hs"))
Data0_ <- merge(Data0_,Data0_V,by.x = c("hs") ,by.y = c("hs"))
Data1_ <- merge(Data1_P,Data1_Q,by.x = c("hs") ,by.y = c("hs"))
Data1_ <- merge(Data1_,Data1_V,by.x = c("hs") ,by.y = c("hs"))
Data2_ <- merge(Data2_P,Data2_Q,by.x = c("hs") ,by.y = c("hs"))
Data2_ <- merge(Data2_,Data2_V,by.x = c("hs") ,by.y = c("hs"))

rm(Data0_P,Data0_Q,Data0_V,Data1_P,Data1_Q,Data1_V,Data2_P,Data2_Q,Data2_V)







Data0_P <- data.frame(aggregate(data0_$meanP,list(data0_$year),mean))
Data0_Q <- data.frame(aggregate(data0_$meanQ,list(data0_$year),mean))
Data0_V <- data.frame(aggregate(data0_$meanV,list(data0_$year),mean))

Data1_P <- data.frame(aggregate(data1_$meanP,list(data1_$year),mean))
Data1_Q <- data.frame(aggregate(data1_$meanQ,list(data1_$year),mean))
Data1_V <- data.frame(aggregate(data1_$meanV,list(data1_$year),mean))

Data2_P <- data.frame(aggregate(data2_$meanP,list(data2_$year),mean))
Data2_Q <- data.frame(aggregate(data2_$meanQ,list(data2_$year),mean))
Data2_V <- data.frame(aggregate(data2_$meanV,list(data2_$year),mean))

names(Data0_P) <- c("year","meanP")
names(Data0_Q) <- c("year","meanQ")
names(Data0_V) <- c("year","meanV")
names(Data1_P) <- c("year","meanP")
names(Data1_Q) <- c("year","meanQ")
names(Data1_V) <- c("year","meanV")
names(Data2_P) <- c("year","meanP")
names(Data2_Q) <- c("year","meanQ")
names(Data2_V) <- c("year","meanV")

DatA0_ <- merge(Data0_P,Data0_Q,by.x = c("year") ,by.y = c("year"))
DatA0_ <- merge(DatA0_,Data0_V,by.x = c("year") ,by.y = c("year"))
DatA1_ <- merge(Data1_P,Data1_Q,by.x = c("year") ,by.y = c("year"))
DatA1_ <- merge(DatA1_,Data1_V,by.x = c("year") ,by.y = c("year"))
DatA2_ <- merge(Data2_P,Data2_Q,by.x = c("year") ,by.y = c("year"))
DatA2_ <- merge(DatA2_,Data2_V,by.x = c("year") ,by.y = c("year"))

rm(Data0_P,Data0_Q,Data0_V,Data1_P,Data1_Q,Data1_V,Data2_P,Data2_Q,Data2_V)

16.bar

ggplot(DatA2_, aes(x=year, y=meanQ))+
  geom_bar(position=position_dodge(), stat="identity") +
  scale_y_continuous(breaks=0:20*4) +
  scale_x_continuous(breaks=0:10*0.5)+
  theme_bw()+
  theme(panel.grid.major=element_line(colour=NA),
        panel.background = element_rect(fill = "transparent",colour = NA),
        plot.background = element_rect(fill = "transparent",colour = NA),
        panel.grid.minor = element_blank(),legend.justification=c(0.85,0.1),legend.position=c(0.85,0.1))

DatA0_$BEC <- 0
DatA1_$BEC <- 1
DatA2_$BEC <- 2
DatA <- rbind(DatA0_,DatA1_,DatA2_)

ggplot(DatA,aes(x=year,y=meanQ,fill=BEC))+
  geom_bar(stat='identity', position='stack') +
  theme_bw() +
  labs(x = 'Specie',y = 'Value', title = 'This is barplot') +
  theme(axis.title =element_text(size = 16),axis.text =element_text(size = 14, color = 'black'),plot.title =element_text(hjust = 0.5, size = 20))


ggplot(DatA,aes(x=year,y=meanQ,fill=BEC))+
  geom_bar(stat='identity', position='dodge') +
  theme_bw() +
  labs(x = 'Specie',y = 'Value', title = 'This is barplot') +
  theme(axis.title =element_text(size = 16),axis.text =element_text(size = 14, color = 'black'),plot.title =element_text(hjust = 0.5, size = 20))

DatA$BEC_str <- as.character(DatA$BEC)

ggplot(DatA,aes(x=year,y=meanQ,fill=BEC_str))+geom_bar(position="dodge",stat="identity")+
  xlab("年份") + ylab("数量") + labs(fill="类型")

image

image

image

image

17.line

ggplot(DatA, aes(year, meanQ,group=BEC))+ 
  geom_path(lineend = "butt", linejoin = "round", stat = "identity",linemitre = 10) +
  geom_line(stat = "identity",position = "identity" ) 

image