Review

R Markdown

# Section 17
library(pscl)
library(InformationValue)

# Population per km2
Pop <- c(797,  3652,   384,   876,  1156, 
         5282,  3602,  4305,  6451, 939,  
         2725,   296,  1187,  4819,  7856,
         1074,  1444,  2620,   417,  3232)

# PM exceeding
PM  <- c( 0,     1,    0,   0,   0, 
          1,     0,    1,   1,   1,
          1,     0,    0,   0,   1,
          0,     0,    1,   0,   1)
# Make a data frame
PM_data <- data.frame(Pop, PM)
str(PM_data)

# Fit the regression model
logistic <- glm(PM~Pop,data=PM_data,family = binomial)
# Print model detail
summary(logistic)
# check effect of model
pR2(logistic)["McFadden"]

# Setting probability cutoff
# Find optimal cutoff probability to use to maximize accuracy
predicted_value <- predict(logistic, PM_data, type = "response")
optimalCutoff(PM_data$PM,predicted_value)[1]

# Define new cities where we want to make predictions
new_cities <- data.frame(Pop = c(1000,5000))
predict(logistic, new_cities, type = "response")

#---------------------------------------------------------

# Section 18
# Read data
Keeling_Data <- read.csv(file = "co2_mm_mlo.csv", header = T)
Keeling_Data$co2[which(Keeling_Data$co2<0)] <- NA
for(i in 1:length(Keeling_Data$co2)){
  if(is.na(Keeling_Data$co2[i])){
    Keeling_Data$co2[i] <- mean(Keeling_Data$co2[(i-2):(i+2)], na.rm=T)
  }
}

co2 <- ts(Keeling_Data$co2, start=c(1958,3),frequency = 12)

plot(co2, type="l")

co2_components <- decompose(co2)
plot(co2_components)

# check the distribution of the error component to varify it's a white noise

# plot hist
hist(co2_components$random, prob = TRUE)
# Add a normal pdf curve
curve(dnorm(x,mean=mean(co2_components$random, na.rm = T),
            sd = sd(co2_components$random, na.rm = T)),add = TRUE, col="red")
## ARIMA model
# make a AR model
par(mfrow=c(2,1))

# ARIMA model (1,0,0)
plot(arima.sim(list(order=c(1,0,0), ar=.9),n=100),ylab="x",
     main=(expression(AR(1)~~~phi==+.9)))

# ARIMA model (0,0,1)
plot(arima.sim(list(order=c(1,0,0), ar=-.9), n=100), ylab="x",
     main=(expression(AR(1)~~~phi==-.9)))

# An example: global temperature
library(astsa)
plot(globtemp, type="l", ylab="Global Temperature Deviations")
# de-trend the data
trModel <- lm(globtemp~c(1:length(globtemp)))
residual <- resid(trModel)
plot(residual, type="l")
# Check acf and pacf of residuals
acf(residual)
pacf(residual)

# the data of the Southern Oscillation Index
library(astsa)
tsplot(soi)
trmodel2<- lm(soi~c(1:length(soi)))
plot(decompose(soi))
residual <- resid(trmodel2)
plot(residual, type="l")
acf(residual)
pacf(residual)

#----------------------------------------------------------------------------
library(dplyr)
library(lubridate)
library(forecast)

# Read in the COVID-19 data
COVID_data <- read.csv(file = "COVID_2020_data.csv", header = T)

# Check the variable names
head(COVID_data)

# Convert the data.frame to a tibble 
COVID_tb1 <- as_tibble(COVID_data)

# Show data
COVID_tb1

# Get global daily new cases
COVID_tb1 <- COVID_tb1 %>%
  mutate(dateRep = as.Date(dateRep,format='%d/%m/%Y')) %>%
  group_by(dateRep) %>%
  summarise(global_cases = sum(cases))

# Filter the data, only use data from April 01
COVID_tb1 <- COVID_tb1 %>%
  filter(dateRep >= as.Date("2020-04-01"))
# Show data
COVID_tb1

## Convert a vector to a time series
# Start date of the time series, read from the .csv file
Date_start <- as.Date("2020-04-01")
Date_end <- as.Date("2020-11-09")

# Get the Julian Day of the start and end date
JD_start <- yday(Date_start)
JD_end <- yday(Date_end)
N_days <- JD_end - JD_start +1

# Convert the vector data to a time series 
global_cases_ts <- ts(COVID_tb1$global_cases[1:N_days], start = c(2020, JD_start), frequency = 365)
# The indicator of the time series
inds <- seq(Date_start, Date_end, by="day")

# Check structure
str(global_cases_ts)
plot(inds, global_cases_ts, type = "l")

## Transform the time series
# Data transform with log
global_cases_ts_log <- log(global_cases_ts)
# Plot time series
plot(inds,global_cases_ts_log, type = "l")

# Check acf and pacf
acf(global_cases_ts_log)
pacf(global_cases_ts_log)

# Take the difference to make the time series stationary
# Take the diff, d=1
global_cases_ts_log_d1 <- diff(global_cases_ts_log)
# Plot time series
plot(global_cases_ts_log_d1, type="l")

# Check the acf and pacf 
acf(global_cases_ts_log_d1)
pacf(global_cases_ts_log_d1)

# Automated forecasting using an ARIMA model
model <- auto.arima(global_cases_ts_log)
model

## Make predictions
# Number of days to predict
days_forecast <- 30

# Number of include in the plot 
days_in_plot <- 30

# Make predictions using the forecast() function
forecast_30days <- forecast(model,days_forecast)
plot(forecast_30days, include = days_in_plot,
     xlab = "Time", ylab="log(global cases)", type="o",lwd=2)
# Get predicted values
# Get predicted values on Nov 10,2020
day_forward <- yday(as.Date("2020-11-10")) - yday(Date_end)
exp(forecast_30days$mean[day_forward])
exp(forecast_30days$lower[day_forward,1])
exp(forecast_30days$upper[day_forward,1])

#--------------------------------------------------------------------------
# One sample t-test with R
Sample <- c(78,83,68,72,88)
t.test(Sample, mu=70, alternative="greater")

# Independent two-sample t-test with R, requires the two populations to have the same variance
SZ_PM2.5  <- c(25.6, 23.7, 21.9, 26.0, 24.5, 22.4, 26.7, 24.6, 22.7, 23.8)
GZ_PM2.5  <- c(27.1, 24.2, 27.9, 33.3, 26.4, 28.7, 25.6, 23.2, 24.0, 27.1, 26.2, 24.4)
t.test(SZ_PM2.5,GZ_PM2.5, alternative = "two.sided", var.equal = T)

# Paired sample t.test with R
# PM2.5 in 2019 Feb.
PM2.5_2019  <- c(53, 41, 26, 19, 26, 48, 64, 46, 42, 65, 53, 81, 51, 32, 44)
# PM2.5 in 2020 Feb.
PM2.5_2020  <- c(63, 33, 24, 19, 33, 49, 38, 28, 41, 36, 41, 43, 43, 23, 35)
t.test(PM2.5_2020,PM2.5_2019,alternative = "two.sided",paired = T)

# Welch two-sample t-test with R, still use SZ_PM2.5 and GZ_PM2.5 data, var.equal=F
t.test(SZ_PM2.5,GZ_PM2.5,alternative = "two.sided", var.equal = F)

#-----------------------------------------------------------------------------------
# Analysis of variance(ANOVA)
# One-way ANOVA
Control  <- c(0.64, 0.91, 0.84, 0.41, 1.58, 0.48, 0.88, 0.74, 1.09, 1.28)
P1       <- c(1.20, 1.17, 0.76, 0.92, 0.65, 1.14, 1.05, 1.28, 0.89, 1.06)
P2       <- c(0.25, 1.31, 0.43, 1.18, 1.02, 0.83, 1.02, 0.66, 0.70, 0.83)
P3       <- c(0.63, 0.30, 0.49, 0.01, 0.63, 0.69, 0.68, 0.34, 1.09, 0.42)
# Make data frame
Cd_data  <- data.frame(Cd=c(Control, P1, P2, P3),
                       group=c(rep("Control", length(Control)),
                               rep("Plant 1", length(P1)),
                               rep("Plant 2", length(P2)),
                               rep("Plant 3", length(P3))) )
oneway.test(Cd~group, data=Cd_data,var.equal = T)
# Another way to perform the one-way ANOVA with R
res_aov <- aov(Cd~group, data=Cd_data)
summary(res_aov)

# Post-hoc test with the Hukey test after one-way ANOVA
TukeyHSD(res_aov)
# Plot the 95% (default) CI of the difference
plot(TukeyHSD(res_aov))

## Two-way ANOVA with R
# Read csv
cd_data <- read.csv("cd_data.csv", header=T)

# Change to factor type using tibble
cd_data_tbl <- as_tibble(cd_data) %>% 
  mutate(Chemical = factor(Chemical, ordered = TRUE)) %>% 
  mutate(Physical = factor(Physical, ordered = TRUE)) %>% 
  mutate(Plant    = factor(Plant   , ordered = TRUE))   
# Quick check
glimpse(cd_data_tbl)
# Test the interaction term
two_way_interaction <- aov(Cd ~ Chemical * Plant, data = cd_data_tbl)
summary(two_way_interaction)
# Additive two-way ANOVA
two_way_additive <- aov(Cd ~ Chemical + Plant, data = cd_data_tbl)
summary(two_way_additive)
TukeyHSD(two_way_additive)
plot(TukeyHSD(two_way_additive))

#-----------------------------------------------------------------------------
# Covariance and Pearson correlation test
# Make up some random values
x <- rnorm(10,0,1)
y <- 10*x+rnorm(10,0.2,0.5)

# Compute covariance
cov(x,y)
# Pearson correlation test
cor.test(x,y,method = "pearson",alternative = "two.sided",conf.level = 0.95)

#-----------------------------------------------------------------------------
# No parametric test

# Mann-Whitney U test: two_sample independent t-test
library(BSDA)
library(FSA)
Treat <- c(3, 5, 1, 4, 3, 5)
Control <- c(4,8,6,2,1,9)
wilcox.test(Treat, Control, paired=F, alternative="two.sided")

# Wilcoxon signed rank test: paired t-test
wilcox.test(Treat, Control, paired = T, alternative = "two.sided")

# When the distribution of differences between paired data values is neither normal nor symmetrical,
# use the sign test
SIGN.test(Treat, Control, alternative = "two.sided", comf.level = 0.95)

# Kruskal-Wallis test: one-way ANOVA (extension of Mann-Whitney U test)
# independent and not paired observations or repeated measures data
require(graphics) #Load data
kruskal.test(Ozone~Month,data = airquality)
# To determine which groups are different from other groups, post-hoc testing can be conducted
dunnTest(Ozone~Month, data=airquality,method="bh") # Most common post-hoc test for the Kruskal-Wallis test


# Spearman correlation test
x <- rnorm(20,0,1)
y <- 2*x+rnorm(20,0,0.5)
# Perform the Spearman correlation test
cor.test(x, y, method="spearman", alternative="two.sided", conf.level=0.95)
# Kendall correlation test (when the sample size is small or there are many tied ranks)
cor.test(x,y,method = "kendall", alternative = "two.sided",conf.level = 0.95)

## Linear regression-----------------------------------------------------------------------------------
# Observations 
Soil_conc      <- c(10, 50, 20, 30, 80, 60, 70, 40)
Uptaken_amount <- c(0.18, 1.05, 0.50, 0.61, 1.58, 1.10, 1.36, 0.77)
# Compute the Pearson correlation coefficient
r              <- cor(Soil_conc, Uptaken_amount)
# Fit a simple linear regression model between
# dependent variable (Uptaken_amount) and
# independent variable (Soil_conc)
reg            <- lm( Uptaken_amount ~ Soil_conc )

# Print details of the linear model
summary(reg)

# Making predictions with the linear model
# Confidence band for mean response
predict(reg, interval="confidence", level=0.95)
# Prediction band for individual response
predict(reg, interval="prediction", level=0.95)

# Plot predictions
# Make data frame
Pesticide_data  <- data.frame(Soil_conc,Uptaken_amount)
# Build the model
reg             <- lm(Uptaken_amount ~ Soil_conc, data=Pesticide_data)
# Make predictions for individual responses
Pred_band       <- predict(reg, interval="prediction", level=0.95)
# Update data frame
Pesticide_data2 <- cbind(Pesticide_data, Pred_band)
# Plot
ggplot(Pesticide_data2, aes(Soil_conc, Uptaken_amount))+
  geom_point() +
  geom_line(aes(y=lwr), color = "red", linetype = "dashed")+
  geom_line(aes(y=upr), color = "red", linetype = "dashed")+
  geom_smooth(method=lm, se=TRUE)
# Make prediction
q <- data.frame(Soil_conc = 25)
predict(reg,q,interval = "prediction", level = 0.95)

# Multiple linear regression----------------------------------------------------
library(GGally)
library(olsrr)
# Read ozone and meteorological data
Ozone_data <- read.csv(file = "ozone_data.csv", header=T)
# Plot scatter plots
ggpairs(Ozone_data,columns=1:5)
# Fit the full model, where all independent variables are included 
full_model  <- lm(Ozone ~ Solar.Rad + Wind.Speed + Temperature + Pressure, data = Ozone_data)
# Test all possible subsets
output      <- ols_step_all_possible(full_model)
# Print results from all possible subsets
output
# Plot results from all possible subsets
plot(output)
# Auto selection
# Backward elimination
ols_step_backward_aic(full_model, details=F)
# Forward selection
ols_step_forward_aic(full_model, details=F)
# Stepwise regression
ols_step_both_aic(full_model, details=F)
# Making predictions with the model
reg <- lm(Ozone ~ Solar.Rad + Wind.Speed + Temperature , data = Ozone_data)
summary(reg)
# Make confidence band
predict(reg, interval="confidence", level=0.95)
# Make prediction band
predict(reg, interval="prediction", level=0.95)

## ANOVA procedure--------------------------------------------------------------
# 1.Check data: Independence, normality, outliers
# 2.Write down H0 and H1
# 3.Anova with R
# 4.Check homoscedasticity of residuals
par(mfrow=c(2,2))
plot(two_way_additive)
# 5.Tukey's test with R
TukeyHSD(two_way_additive)
plot(TukeyHSD(two_way_additive))
# 6.Report results

## Procedure of the simple linear regression------------------------------------
# 1.check data: indenpendence of the dependent variable(Y) && outliers
# 2.Write down H0 and H1
# 3.Test the slope
reg             <- lm(Uptaken_amount ~ Soil_conc, data=Pesticide_data)
summary(reg)
# 4.check residuals
par(mfrow = c(2, 2))
plot(reg)
# 5. report the results
Review

Yanan Zhao

2022-05-22

R Markdown