library(ggplot2)
library(statsr)
library(gridExtra)
library(lubridate)
rm(list=ls())
dev.off
## function (which = dev.cur())
## {
## if (which == 1)
## stop("cannot shut down device 1 (the null device)")
## .External(C_devoff, as.integer(which))
## dev.cur()
## }
## <bytecode: 0x7ff8a2f795f0>
## <environment: namespace:grDevices>
cat("\014")
# Loading the data
TRAFFIC <- read.csv("~/Downloads/Datasets/TRAFFIC.csv")
head(TRAFFIC)
## TUNNEL DAY VOLUME_PER_MINUTE
## 1 Holland weekday 3
## 2 Holland weekday 57
## 3 Holland weekday 88
## 4 Holland weekday 61
## 5 Holland weekday 64
## 6 Holland weekday 95
dim(TRAFFIC)
## [1] 2801 3
table(TRAFFIC$TUNNEL)
##
## Holland Lincoln
## 1401 1400
We can see that the dataset provided consists of 2810 rows and 3 column variables.
str(TRAFFIC)
## 'data.frame': 2801 obs. of 3 variables:
## $ TUNNEL : chr "Holland" "Holland" "Holland" "Holland" ...
## $ DAY : chr "weekday" "weekday" "weekday" "weekday" ...
## $ VOLUME_PER_MINUTE: int 3 57 88 61 64 95 13 81 101 68 ...
We can see that all the columns TUNNEL and DAY are of character type but are categorical varibales and the attribute VOLUME_PER_MINUTE is a numerical variables
summary(TRAFFIC)
## TUNNEL DAY VOLUME_PER_MINUTE
## Length:2801 Length:2801 Min. :-22.00
## Class :character Class :character 1st Qu.: 53.00
## Mode :character Mode :character Median : 64.00
## Mean : 62.89
## 3rd Qu.: 74.00
## Max. :137.00
A basic function that returns the mean, medaian and standard deviation of a variable
explore<-function(x){
data<-c("Mean"=mean(x, na.rm=T),
"Median"=median(x, na.rm =T),
"Standard Deviation" = sd(x, na.rm =T),
"Length" = length(x))
return(data)
}
explore(TRAFFIC$VOLUME_PER_MINUTE)
## Mean Median Standard Deviation Length
## 62.88790 64.00000 15.85356 2801.00000
You can also embed plots, for example:
gp1 <- ggplot(data = TRAFFIC,
aes(x = VOLUME_PER_MINUTE, fill= as.factor(TUNNEL)))+
geom_histogram(
aes(y=..density..),
colour = "black",
fill = "thistle",
binwidth = 3) +
geom_density(alpha=.2)
gp1<-gp1+ scale_fill_discrete(name = "Tunnel Type")
gp2 <- ggplot(data = TRAFFIC,
aes(x = VOLUME_PER_MINUTE, fill= as.factor(DAY)))+
geom_histogram(
aes(y=..density..),
colour = "black",
fill = "thistle",
binwidth = 5) +
geom_density(alpha=.2)
gp2<-gp2+ scale_fill_discrete(name = "Day Type")
grid.arrange(gp1, gp2, nrow = 2, ncol =1)
We can see that the distribution of VOLUME_PER_MINUTE is very similar to a Normal Distribution
P-Value is the probability of getting the observed value of the test statistic, if the null hypothesis is actually true.
The smaller the P-Value, stronger is the evidence that we can reject the NULL hypothesis.
P-Values range from 0 to 1.
If P-Value is greater than an acceptable threshold then we cannot reject the NULL hypothesis because then we have a strong evidence in favor of it. (Most of the times threshold will be 0.05, which is equal to 5%).
Lets calculate the P-Value using Z-test.
lincoln <-subset(TRAFFIC,TRAFFIC$TUNNEL=="Lincoln")
holland <-subset(TRAFFIC,TRAFFIC$TUNNEL=="Holland")
lincoln_explore <- explore(lincoln$VOLUME_PER_MINUTE)
holland_explore <- explore(holland$VOLUME_PER_MINUTE)
sd_lin_hol <- sqrt(lincoln_explore["Standard Deviation"]^2/lincoln_explore["Length"] +holland_explore["Standard Deviation"]^2/holland_explore["Length"])
z_score <- (lincoln_explore[1]-holland_explore[1])/sd_lin_hol
z_score
## Mean
## 20.52819
1-pnorm(z_score)
## Mean
## 0
plot(x=seq(-22,22, by=0.1),y=dnorm(seq(-22,22,by=0.1),mean=0),type='l',xlab = 'mean difference', ylab='possibility')
abline(v=z_score, col='thistle')
## We found great evidence to support our hypothesis that traffic in the Lincoln tunnel is higher than in the Holland Tunnel.
indiv_happiness <- read.csv("~/Downloads/Datasets/indiv_happiness.csv")
#Understanding the data
dim(indiv_happiness)
## [1] 30000 5
We can see that the dataset provided consists of 30000 rows and 5 column variables.
str(indiv_happiness)
## 'data.frame': 30000 obs. of 5 variables:
## $ id : int 111245 13417 391011 143824 512471 337602 503479 205103 487386 178342 ...
## $ country : chr "Australia" "Philippines" "Belgium" "Liberia" ...
## $ continent: chr "Australia" "Asia" "Europe" "Africa" ...
## $ happiness: num 7.41 5.33 7.03 4.08 4.49 5.31 4 5.56 7.35 6.16 ...
## $ immigrant: int 0 0 0 0 0 0 0 0 0 0 ...
summary(indiv_happiness)
## id country continent happiness
## Min. : 11 Length:30000 Length:30000 Min. :2.500
## 1st Qu.:151876 Class :character Class :character 1st Qu.:4.560
## Median :299975 Mode :character Mode :character Median :5.290
## Mean :299842 Mean :5.419
## 3rd Qu.:447960 3rd Qu.:6.240
## Max. :599995 Max. :8.100
## immigrant
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.1224
## 3rd Qu.:0.0000
## Max. :1.0000
explore(indiv_happiness$happiness)
## Mean Median Standard Deviation Length
## 5.419064 5.290000 1.112081 30000.000000
You can also embed plots, for example:
gp1 <- ggplot(data = indiv_happiness,
aes(x = happiness))+
geom_histogram(
aes(y=..density..),
colour = "black",
fill = "thistle",
binwidth = 0.3) +
geom_density(alpha=.2)
gp1<-gp1+ scale_fill_discrete(name = "Tunnel Type")
gp2 <- ggplot(data = indiv_happiness,
aes(x = happiness, fill= as.factor(continent)))+
geom_histogram(
aes(y=..density..),
colour = "black",
fill = "thistle",
binwidth = 0.5) +
geom_density(alpha=.2)
gp2<-gp2+ scale_fill_discrete(name = "Continent")
grid.arrange(gp1, gp2, nrow = 2, ncol =1)
We can see that the distribution of happiness is very similar to a Normal Distribution
Lets get an intuitive idea using a plot
gp2 <- ggplot(data = indiv_happiness[indiv_happiness$continent %in% c("South America","Europe"),],
aes(x = happiness, fill= as.factor(continent)))+
geom_histogram(
aes(y=..density..),
colour = "black",
fill = "thistle",
binwidth = 0.1) +
geom_density(alpha=.2)
gp2<-gp2+ scale_fill_discrete(name = "Continent")
gp2
Lets calculate the P-Value using Z-test.
Step - 1: Subset the data
south_america <- indiv_happiness[indiv_happiness$continent=="South America",]
europe<-indiv_happiness[indiv_happiness$continent=="Europe",]
Step-2: Calculate the z-score For the ease of calculating z_score we can define a single function
zscore<-function(x,y){
z<-(mean(x)-mean(y))/(sqrt(sd(x)^2/length(x) +sd(y)^2/length(y)))
return(z)
}
z_score<-zscore(south_america$happiness,europe$happiness)
z_score
## [1] 0.169798
1-pnorm(z_score)
## [1] 0.4325845
plot(x=seq(-22,22, by=0.1),y=dnorm(seq(-22,22,by=0.1),mean=0),type='l',xlab = 'mean difference', ylab='possibility')
abline(v=z_score, col='thistle')
## Not Significant evidence to reject the Null Hypothesis
Lets get an intuitive idea using a plot
gp2 <- ggplot(data = indiv_happiness[indiv_happiness$continent %in% c("South America","North America"),],
aes(x = happiness, fill= as.factor(continent)))+
geom_histogram(
aes(y=..density..),
colour = "black",
fill = "thistle",
binwidth = 0.1) +
geom_density(alpha=.2)
gp2<-gp2+ scale_fill_discrete(name = "Continent")
gp2
z_score<-zscore(indiv_happiness[as.character(indiv_happiness$continent)=="North America",]$happiness,
south_america$happiness )
z_score
## [1] 1.869066
plot(x=seq(-3,3, by=0.1),y=dnorm(seq(-3,3,by=0.1),mean=0),type='l',xlab = 'mean difference', ylab='possibility')
abline(v=z_score, col='thistle')
1-pnorm(z_score)
## [1] 0.03080681