setwd("~/R TRAINING/session 5")
G2;H2;Warningh: The working directory was changed to C:/Users/USER/Documents/R TRAINING/session 5 inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.g
options(scipen=999)
##load the data
#Example:pearson correlation
data<-data.frame(x=c(1,2,3,4,5),
y=c(2,4,5,7,6))
data
pearson_corr<-cor(data$x, data$y,method="pearson")
print(paste("pearson correlation:",pearson_corr))
[1] "pearson correlation: 0.904194430179465"
pearson_test<-cor.test(data$x, data$y, method="pearson")
print(pearson_test,3)
Pearson's product-moment correlation
data: data$x and data$y
t = 4, df = 3, p-value = 0.04
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.108 0.994
sample estimates:
cor
0.904
reject null hypothesis
spearman_corr<-cor(data$x,data$y,method="spearman")
print(paste("spearman correlation:",spearman_corr))
[1] "spearman correlation: 0.9"
spearman_test<-cor.test(data$x, data$y, method = "spearman")
print(spearman_test,3)
Spearman's rank correlation rho
data: data$x and data$y
S = 2, p-value = 0.08
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.9
##pratical
library(ggplot2)
head(economics)
corr_matrix <- economics%>%
dplyr::select(pce,pop,psavert,unemploy)%>%
cor(use="pairwise.complete.obs")
corr_matrix
pce pop psavert unemploy
pce 1.0000000 0.9872421 -0.7928546 0.6145176
pop 0.9872421 1.0000000 -0.8363147 0.6337165
psavert -0.7928546 -0.8363147 1.0000000 -0.3093769
unemploy 0.6145176 0.6337165 -0.3093769 1.0000000
head(economics)
use im()to fit a simple linear regression model
data<-data.frame(x=c(1,2,3,4,5),
y=c(2,4,5,4,6))
print(data)
model<-lm(y~x,data=data)
summary(model)
Call:
lm(formula = y ~ x, data = data)
Residuals:
1 2 3 4 5
-0.6 0.6 0.8 -1.0 0.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.8000 0.9381 1.919 0.1508
x 0.8000 0.2828 2.828 0.0663 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8944 on 3 degrees of freedom
Multiple R-squared: 0.7273, Adjusted R-squared: 0.6364
F-statistic: 8 on 1 and 3 DF, p-value: 0.06628
##simple linear regression example using gapminder s=data set
library(gapminder)
data("gapminder")
head(gapminder)
model<-lm(lifeExp~ gdpPercap,data=gapminder)
summary(model)
Call:
lm(formula = lifeExp ~ gdpPercap, data = gapminder)
Residuals:
Min 1Q Median 3Q Max
-82.754 -7.758 2.176 8.225 18.426
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 53.95556088 0.31499494 171.29 <0.0000000000000002 ***
gdpPercap 0.00076488 0.00002579 29.66 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10.49 on 1702 degrees of freedom
Multiple R-squared: 0.3407, Adjusted R-squared: 0.3403
F-statistic: 879.6 on 1 and 1702 DF, p-value: < 0.00000000000000022
data<-data.frame(y=c(2,4,5,4,6),
x1=c(1,2,3,4,5),
x2=c(3,5,7,6,8))
data
model<-lm(y~x1+x2,data=data)
summary(model)
Call:
lm(formula = y ~ x1 + x2, data = data)
Residuals:
1 2 3 4 5
-0.06667 0.33333 -0.26667 -0.20000 0.20000
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.4222 0.6748 -0.626 0.5954
x1 -0.1778 0.2703 -0.658 0.5784
x2 0.8889 0.2222 4.000 0.0572 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3651 on 2 degrees of freedom
Multiple R-squared: 0.9697, Adjusted R-squared: 0.9394
F-statistic: 32 on 2 and 2 DF, p-value: 0.0303
library(ggplot2)
data("economics")
head(economics)
pce_model<-lm(psavert~pce,data=economics)
summary(pce_model)
Call:
lm(formula = psavert ~ pce, data = economics)
Residuals:
Min 1Q Median 3Q Max
-5.2442 -1.3751 -0.3442 1.3484 7.6090
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.75213073 0.12716708 92.42 <0.0000000000000002 ***
pce -0.00066075 0.00002124 -31.12 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.808 on 572 degrees of freedom
Multiple R-squared: 0.6286, Adjusted R-squared: 0.628
F-statistic: 968.2 on 1 and 572 DF, p-value: < 0.00000000000000022
pop_model<-lm(psavert~pce,data=economics)
summary(pop_model)
Call:
lm(formula = psavert ~ pce, data = economics)
Residuals:
Min 1Q Median 3Q Max
-5.2442 -1.3751 -0.3442 1.3484 7.6090
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.75213073 0.12716708 92.42 <0.0000000000000002 ***
pce -0.00066075 0.00002124 -31.12 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.808 on 572 degrees of freedom
Multiple R-squared: 0.6286, Adjusted R-squared: 0.628
F-statistic: 968.2 on 1 and 572 DF, p-value: < 0.00000000000000022
##unemploy model
unemploy_model<-lm(psavert~pce,data=economics)
summary(unemploy_model)
Call:
lm(formula = psavert ~ pce, data = economics)
Residuals:
Min 1Q Median 3Q Max
-5.2442 -1.3751 -0.3442 1.3484 7.6090
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.75213073 0.12716708 92.42 <0.0000000000000002 ***
pce -0.00066075 0.00002124 -31.12 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.808 on 572 degrees of freedom
Multiple R-squared: 0.6286, Adjusted R-squared: 0.628
F-statistic: 968.2 on 1 and 572 DF, p-value: < 0.00000000000000022
uempmed_model<-lm(psavert~pce,data=economics)
summary(uempmed_model)
Call:
lm(formula = psavert ~ pce, data = economics)
Residuals:
Min 1Q Median 3Q Max
-5.2442 -1.3751 -0.3442 1.3484 7.6090
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.75213073 0.12716708 92.42 <0.0000000000000002 ***
pce -0.00066075 0.00002124 -31.12 <0.0000000000000002 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.808 on 572 degrees of freedom
Multiple R-squared: 0.6286, Adjusted R-squared: 0.628
F-statistic: 968.2 on 1 and 572 DF, p-value: < 0.00000000000000022
overal_model<-lm(psavert~pce+pop+unemploy+uempmed,data=economics)
summary(overal_model)
Call:
lm(formula = psavert ~ pce + pop + unemploy + uempmed, data = economics)
Residuals:
Min 1Q Median 3Q Max
-4.8282 -0.6982 -0.0981 0.6336 5.4816
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 45.67014927 2.23116237 20.469 < 0.0000000000000002 ***
pce 0.00085067 0.00011865 7.170 0.00000000000235 ***
pop -0.00017334 0.00001101 -15.742 < 0.0000000000000002 ***
unemploy 0.00024997 0.00004824 5.182 0.00000030589469 ***
uempmed 0.16599616 0.03559460 4.664 0.00000387763733 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.186 on 569 degrees of freedom
Multiple R-squared: 0.8409, Adjusted R-squared: 0.8398
F-statistic: 752.1 on 4 and 569 DF, p-value: < 0.00000000000000022
library(stargazer)
G3;
Please cite as:
gG3; Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
gG3; R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
g
stargazer(pce_model,pop_model,unemploy_model,uempmed_model,overal_model,
type="text",
title="regression results",
dep.var.labels=c("savings rate"))
Warning :length of NULL cannot be changed
Warning :length of NULL cannot be changed
Warning :length of NULL cannot be changed
Warning :length of NULL cannot be changed
Warning :length of NULL cannot be changed
Warning :number of rows of result is not a multiple of vector length (arg 2)
regression results
================================================================================================================================================
Dependent variable:
----------------------------------------------------------------------------------------------------------------------------
savings rate
(1) (2) (3) (4) (5)
------------------------------------------------------------------------------------------------------------------------------------------------
pce -0.001*** -0.001*** -0.001*** -0.001*** 0.001***
(0.00002) (0.00002) (0.00002) (0.00002) (0.0001)
pop -0.0002***
(0.00001)
unemploy 0.0002***
(0.00005)
uempmed 0.166***
(0.036)
Constant 11.752*** 11.752*** 11.752*** 11.752*** 45.670***
(0.127) (0.127) (0.127) (0.127) (2.231)
------------------------------------------------------------------------------------------------------------------------------------------------
Observations 574 574 574 574 574
R2 0.629 0.629 0.629 0.629 0.841
Adjusted R2 0.628 0.628 0.628 0.628 0.840
Residual Std. Error 1.808 (df = 572) 1.808 (df = 572) 1.808 (df = 572) 1.808 (df = 572) 1.186 (df = 569)
F Statistic 968.195*** (df = 1; 572) 968.195*** (df = 1; 572) 968.195*** (df = 1; 572) 968.195*** (df = 1; 572) 752.085*** (df = 4; 569)
================================================================================================================================================
Note: *p<0.1; **p<0.05; ***p<0.01
helps evaluate whether a linear regression model meets the key assumptions required for reliable inference.
library(car)
G3;Loading required package: carData
g
vif_values<-vif(overal_model)
print(vif_values)
pce pop unemploy uempmed
72.508990 66.420421 6.613911 8.699492
library(tidyverse)
── Attaching core tidyverse packages ─────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ lubridate 1.9.4 ✔ tibble 3.2.1
✔ purrr 1.0.4 ✔ tidyr 1.3.1
── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks plotly::filter(), stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ dplyr::recode() masks car::recode()
✖ purrr::some() masks car::some()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(corrplot)
G3;corrplot 0.95 loaded
g
cor_matrix<-cor(select(economics, where(is.numeric)),use="complete.obs")
corrplot(cor_matrix,
method="color",
type="upper",
tl.col="black",
tl.srt=45,
addCoef.col="white",
number.cex=0.7,
diag=TRUE)
should have constant variance
library(forecast)
G3;Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
g
checkresiduals(overal_model)
Breusch-Godfrey test for serial correlation of order up to 10
data: Residuals
LM test = 379.92, df = 10, p-value < 0.00000000000000022
library(lmtest)
G3;Loading required package: zoo
gG3;
Attaching package: ‘zoo’
gG3;The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
g
library(car)
dwtest(overal_model)
Durbin-Watson test
data: overal_model
DW = 0.4103, p-value < 0.00000000000000022
alternative hypothesis: true autocorrelation is greater than 0
autocorrelation exists
library(lmtest)
bptest(overal_model)
studentized Breusch-Pagan test
data: overal_model
BP = 50.022, df = 4, p-value = 0.0000000003573
non-constant variance(hetero…..)
#positive correlation
library(ggplot2)
ggplot(economics, aes(x = pop, y = psavert)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(
title = "Positive Correlation: Population vs. psaverting",
x = "U.S. Population",
y = "Unemployment"
) +
theme_gray()
#negative correlation
ggplot(economics, aes(x = pop, y = unemploy)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Negative Correlation: Population vs. Unemployment",
x = "U.S. Population",
y = "Unemployment") +
theme_minimal()
#no correlation
ggplot(economics, aes(x = pop, y = pce)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "No Correlation: Population vs. Personal Consumption Expenditures",
x = "U.S. Population",
y = "Personal Consumption Expenditures") +
theme_minimal()
#perfect correlation
ggplot(economics, aes(x = pop, y = pop)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Perfect Correlation: Population vs. Population",
x = "U.S. Population",
y = "Population") +
theme_minimal()
#weak correlation
ggplot(economics, aes(x = pop, y = psavert)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Weak Correlation: Population vs. psaverting",
x = "U.S. Population",
y = "Unemployment") +
theme_minimal()
#moderate correlation
ggplot(economics, aes(x = pop, y = unemploy)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(
title = "Moderate Correlation: Population vs. Unemployment",
x = "U.S. Population",
y = "Unemployment"
) +
theme_minimal()
#strong correlation
ggplot(economics, aes(x = pop, y = pce)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(
title = "Strong Correlation: Population vs. Personal Consumption Expenditures",
x = "U.S. Population",
y = "Personal Consumption Expenditures"
) +
theme_minimal()
#very strong correlation
ggplot(economics, aes(x = pop, y = pop)) +
geom_point(color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(
title = "Very Strong Correlation: Population vs. Population",
x = "U.S. Population",
y = "Population"
) +
theme_minimal()
#causation vs correlation
#Basic Correlation Analysis # Calculate correlation matrix
corr_matrix <- cor(economics[, c("pce", "pop", "psavert", "unemploy")], use = "pairwise.complete.obs")
corr_matrix
pce pop psavert unemploy
pce 1.0000000 0.9872421 -0.7928546 0.6145176
pop 0.9872421 1.0000000 -0.8363147 0.6337165
psavert -0.7928546 -0.8363147 1.0000000 -0.3093769
unemploy 0.6145176 0.6337165 -0.3093769 1.0000000
library(corrplot)
corrplot(corr_matrix, method = "circle", type = "upper", tl.col = "black", tl.srt = 45)
##simulating Correlation Without Causation
##sample data
x=c(10,20,30,40,50)
y=c(15,25,35,45,55)
z=c(50,40,30,20,10)
##Create a data frame
data <- data.frame(x, y, z)
##correlation matrix
cor_matrix <- cor(data)
print(cor_matrix)
x y z
x 1 1 -1
y 1 1 -1
z -1 -1 1
interpretation of correlation coefficient
interprete_correlation <- function(corr) {
if (corr > 0.8) {
return("Very Strong Positive Correlation")
} else if (corr > 0.6) {
return("Strong Positive Correlation")
} else if (corr > 0.4) {
return("Moderate Positive Correlation")
} else if (corr > 0.2) {
return("Weak Positive Correlation")
} else if (corr < -0.2) {
return("Weak Negative Correlation")
} else if (corr < -0.4) {
return("Moderate Negative Correlation")
} else if (corr < -0.6) {
return("Strong Negative Correlation")
} else if (corr < -0.8) {
return("Very Strong Negative Correlation")
} else {
return("No Correlation")
}
}
r_value <- cor(data$x, data$y)
cat("correlation between x and y is, r_value", ":",)
G1;H1;Errorh in cat("correlation between x and y is, r_value", ":", ) :
argument is missing, with no default
gG1;H1;Error during wrapuph: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
g
interprete_correlation(r_value)
[1] "Very Strong Positive Correlation"