P-Values and Z-Scores

Load Packages

library(ggplot2)
library(statsr)
library(gridExtra)
library(lubridate)

Let’s clean the global environment before moving further

rm(list=ls())
dev.off
## function (which = dev.cur()) 
## {
##     if (which == 1) 
##         stop("cannot shut down device 1 (the null device)")
##     .External(C_devoff, as.integer(which))
##     dev.cur()
## }
## <bytecode: 0x7ff8a2f795f0>
## <environment: namespace:grDevices>
cat("\014")

# Loading the data

TRAFFIC <- read.csv("~/Downloads/Datasets/TRAFFIC.csv")
head(TRAFFIC)
##    TUNNEL     DAY VOLUME_PER_MINUTE
## 1 Holland weekday                 3
## 2 Holland weekday                57
## 3 Holland weekday                88
## 4 Holland weekday                61
## 5 Holland weekday                64
## 6 Holland weekday                95

Understanding the data

dim(TRAFFIC)
## [1] 2801    3
table(TRAFFIC$TUNNEL)
## 
## Holland Lincoln 
##    1401    1400

We can see that the dataset provided consists of 2810 rows and 3 column variables.

str(TRAFFIC)
## 'data.frame':    2801 obs. of  3 variables:
##  $ TUNNEL           : chr  "Holland" "Holland" "Holland" "Holland" ...
##  $ DAY              : chr  "weekday" "weekday" "weekday" "weekday" ...
##  $ VOLUME_PER_MINUTE: int  3 57 88 61 64 95 13 81 101 68 ...

We can see that all the columns TUNNEL and DAY are of character type but are categorical varibales and the attribute VOLUME_PER_MINUTE is a numerical variables

summary(TRAFFIC)
##     TUNNEL              DAY            VOLUME_PER_MINUTE
##  Length:2801        Length:2801        Min.   :-22.00   
##  Class :character   Class :character   1st Qu.: 53.00   
##  Mode  :character   Mode  :character   Median : 64.00   
##                                        Mean   : 62.89   
##                                        3rd Qu.: 74.00   
##                                        Max.   :137.00

Adding Basic Functionalities

A basic function that returns the mean, medaian and standard deviation of a variable

explore<-function(x){
  data<-c("Mean"=mean(x, na.rm=T),
          "Median"=median(x, na.rm =T), 
          "Standard Deviation" = sd(x, na.rm =T),
          "Length" = length(x))
  return(data)
}
explore(TRAFFIC$VOLUME_PER_MINUTE)
##               Mean             Median Standard Deviation             Length 
##           62.88790           64.00000           15.85356         2801.00000

Including Plots

You can also embed plots, for example:

gp1 <- ggplot(data = TRAFFIC, 
              aes(x = VOLUME_PER_MINUTE, fill= as.factor(TUNNEL)))+ 
               geom_histogram(
               aes(y=..density..), 
               colour = "black", 
               fill = "thistle",
               binwidth = 3) + 
               geom_density(alpha=.2)
gp1<-gp1+ scale_fill_discrete(name = "Tunnel Type")
gp2 <- ggplot(data = TRAFFIC, 
              aes(x = VOLUME_PER_MINUTE, fill= as.factor(DAY)))+ 
               geom_histogram(
               aes(y=..density..), 
               colour = "black", 
               fill = "thistle",
               binwidth = 5) + 
               geom_density(alpha=.2)
gp2<-gp2+ scale_fill_discrete(name = "Day Type")

grid.arrange(gp1, gp2, nrow = 2, ncol =1)

We can see that the distribution of VOLUME_PER_MINUTE is very similar to a Normal Distribution

The definition of P-Value and Main Idea

P-Value is the probability of getting the observed value of the test statistic, if the null hypothesis is actually true.

The smaller the P-Value, stronger is the evidence that we can reject the NULL hypothesis.

P-Values range from 0 to 1.

If P-Value is greater than an acceptable threshold then we cannot reject the NULL hypothesis because then we have a strong evidence in favor of it. (Most of the times threshold will be 0.05, which is equal to 5%).

Our Hypothesis: Traffic in Lincoln is higher than traffic in Holland tunnel

Null Hypothesis(H0): Traffic in Lincoln is same as Traffic in Holland tunnel.

Lets calculate the P-Value using Z-test.

Step - 1 Subset the data

lincoln <-subset(TRAFFIC,TRAFFIC$TUNNEL=="Lincoln")
holland <-subset(TRAFFIC,TRAFFIC$TUNNEL=="Holland")

Let’s calculate the Z-Score

lincoln_explore <- explore(lincoln$VOLUME_PER_MINUTE)
holland_explore <- explore(holland$VOLUME_PER_MINUTE)
sd_lin_hol <- sqrt(lincoln_explore["Standard Deviation"]^2/lincoln_explore["Length"] +holland_explore["Standard Deviation"]^2/holland_explore["Length"])
z_score <- (lincoln_explore[1]-holland_explore[1])/sd_lin_hol
z_score
##     Mean 
## 20.52819
1-pnorm(z_score) 
## Mean 
##    0

the p-value is so small, it returns 0. It is almost impossible for that result (the difference in means between Holland and Lincoln tunnel) to happen by chance

plot(x=seq(-22,22, by=0.1),y=dnorm(seq(-22,22,by=0.1),mean=0),type='l',xlab = 'mean difference',  ylab='possibility')
abline(v=z_score, col='thistle')

## We found great evidence to support our hypothesis that traffic in the Lincoln tunnel is higher than in the Holland Tunnel.

Let’s explore another dataset

Load the dataset

indiv_happiness <- read.csv("~/Downloads/Datasets/indiv_happiness.csv")

#Understanding the data

dim(indiv_happiness)
## [1] 30000     5

We can see that the dataset provided consists of 30000 rows and 5 column variables.

str(indiv_happiness)
## 'data.frame':    30000 obs. of  5 variables:
##  $ id       : int  111245 13417 391011 143824 512471 337602 503479 205103 487386 178342 ...
##  $ country  : chr  "Australia" "Philippines" "Belgium" "Liberia" ...
##  $ continent: chr  "Australia" "Asia" "Europe" "Africa" ...
##  $ happiness: num  7.41 5.33 7.03 4.08 4.49 5.31 4 5.56 7.35 6.16 ...
##  $ immigrant: int  0 0 0 0 0 0 0 0 0 0 ...
summary(indiv_happiness)
##        id           country           continent           happiness    
##  Min.   :    11   Length:30000       Length:30000       Min.   :2.500  
##  1st Qu.:151876   Class :character   Class :character   1st Qu.:4.560  
##  Median :299975   Mode  :character   Mode  :character   Median :5.290  
##  Mean   :299842                                         Mean   :5.419  
##  3rd Qu.:447960                                         3rd Qu.:6.240  
##  Max.   :599995                                         Max.   :8.100  
##    immigrant     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1224  
##  3rd Qu.:0.0000  
##  Max.   :1.0000
explore(indiv_happiness$happiness)
##               Mean             Median Standard Deviation             Length 
##           5.419064           5.290000           1.112081       30000.000000

Including Plots

You can also embed plots, for example:

gp1 <- ggplot(data = indiv_happiness, 
              aes(x = happiness))+ 
               geom_histogram(
               aes(y=..density..), 
               colour = "black", 
               fill = "thistle",
               binwidth = 0.3) + 
               geom_density(alpha=.2)
gp1<-gp1+ scale_fill_discrete(name = "Tunnel Type")
gp2 <- ggplot(data = indiv_happiness, 
              aes(x = happiness, fill= as.factor(continent)))+ 
               geom_histogram(
               aes(y=..density..), 
               colour = "black", 
               fill = "thistle",
               binwidth = 0.5) + 
               geom_density(alpha=.2)
gp2<-gp2+ scale_fill_discrete(name = "Continent")

grid.arrange(gp1, gp2, nrow = 2, ncol =1)

We can see that the distribution of happiness is very similar to a Normal Distribution

Null Hypothesis(H0):People are not happier in South America than in Europe

Our Hypothesis/Alternate Hypothesis(H1):People in South America happier than people in Europe

Lets get an intuitive idea using a plot

gp2 <- ggplot(data = indiv_happiness[indiv_happiness$continent %in% c("South America","Europe"),], 
              aes(x = happiness, fill= as.factor(continent)))+ 
               geom_histogram(
               aes(y=..density..), 
               colour = "black", 
               fill = "thistle",
               binwidth = 0.1) + 
               geom_density(alpha=.2)
gp2<-gp2+ scale_fill_discrete(name = "Continent")
gp2

Lets calculate the P-Value using Z-test.

Step - 1: Subset the data

south_america <- indiv_happiness[indiv_happiness$continent=="South America",]
europe<-indiv_happiness[indiv_happiness$continent=="Europe",]

Step-2: Calculate the z-score For the ease of calculating z_score we can define a single function

zscore<-function(x,y){
  z<-(mean(x)-mean(y))/(sqrt(sd(x)^2/length(x) +sd(y)^2/length(y)))
  return(z)
}
z_score<-zscore(south_america$happiness,europe$happiness)
z_score
## [1] 0.169798
1-pnorm(z_score)
## [1] 0.4325845

We have a p-value of 0.4325.

We don’t have enough evidence to reject the Null Hypothesis

plot(x=seq(-22,22, by=0.1),y=dnorm(seq(-22,22,by=0.1),mean=0),type='l',xlab = 'mean difference',  ylab='possibility')
abline(v=z_score, col='thistle')

## Not Significant evidence to reject the Null Hypothesis

Are people in North America happier than people in South America?

Lets get an intuitive idea using a plot

gp2 <- ggplot(data = indiv_happiness[indiv_happiness$continent %in% c("South America","North America"),], 
              aes(x = happiness, fill= as.factor(continent)))+ 
               geom_histogram(
               aes(y=..density..), 
               colour = "black", 
               fill = "thistle",
               binwidth = 0.1) + 
               geom_density(alpha=.2)
gp2<-gp2+ scale_fill_discrete(name = "Continent")
gp2

z_score<-zscore(indiv_happiness[as.character(indiv_happiness$continent)=="North America",]$happiness,
               south_america$happiness )
z_score
## [1] 1.869066
plot(x=seq(-3,3, by=0.1),y=dnorm(seq(-3,3,by=0.1),mean=0),type='l',xlab = 'mean difference',  ylab='possibility')
abline(v=z_score, col='thistle')

1-pnorm(z_score)
## [1] 0.03080681

P=3.08%, we have evidence to reject the NULL hypothesis

People are happier in NA than in SA