Part I - Basic Stats and Probability

This first part of the assignment will be focused on the basic stats and probability concepts introduced in class. Please describe and show all your calculations in the R file (you can use comments to explain the methodology being applied).

Problem Description: A worldwide health organization collected some data regarding stroke from different hospitals around the world. The organization wants to calculate some probabilities based on the data gathered.

Q1: Loading the data and creating a contingency table (20 points)

# install package ISwR
#install.packages("ISwR")
# import package ISwR
library(ISwR)
# view data 'stroke'
stroke
# creating a contingency table using table()
temp <- table(stroke$sex,
              stroke$dead)
temp
##         
##          FALSE TRUE
##   Female   189  321
##   Male     155  164

Q2: Calculating Conditional Probabilities (30 points)

# create a prop table to get percentage probabilities of events in table temp
prob_temp <- prop.table(temp,1)
prob_temp
##         
##              FALSE      TRUE
##   Female 0.3705882 0.6294118
##   Male   0.4858934 0.5141066
# print result
print(paste0("Probability that a person will die from a stroke knowing that this person is a male = ", prob_temp[2,2]))
## [1] "Probability that a person will die from a stroke knowing that this person is a male = 0.5141065830721"
# create contingency table for variables dead and diab 
temp2 <- table(stroke$dead,
               stroke$diab)
temp2
##        
##          No Yes
##   FALSE 308  35
##   TRUE  414  62
# create a prop table to calculate probability
prob_temp2 <- prop.table(temp2,2)
prob_temp2
##        
##                No       Yes
##   FALSE 0.4265928 0.3608247
##   TRUE  0.5734072 0.6391753
print(paste0("The probability of a person to die from a stroke knowing that this person had diabetes = ", prob_temp2[2,2]))
## [1] "The probability of a person to die from a stroke knowing that this person had diabetes = 0.639175257731959"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# create a subset of the data stroke for variable dead=TRUE
stroke_subset <- stroke%>%
                filter(dead==TRUE)%>%
                dplyr::select(sex, coma)

stroke_subset
# create contingency table table for the subset
temp3 <- table(stroke_subset)
temp3
##         coma
## sex       No Yes
##   Female 269  50
##   Male   133  29
# calculate probability using prop table
prob_temp3 <- prop.table(temp3,2)
prob_temp3
##         coma
## sex             No       Yes
##   Female 0.6691542 0.6329114
##   Male   0.3308458 0.3670886
print(paste0("Gender (male or female) that is more likely to die from a stroke in case the person is in a coma is: Female"))
## [1] "Gender (male or female) that is more likely to die from a stroke in case the person is in a coma is: Female"

Part II - Probability Distributions The second part of the assignment will be focused on Probability Distributions. You will be requested to identify the distribution that variables follow and calculate probabilities out of that

Problem Description: In this part of the assignment, you will be assigned different datasets for each question so that you can identify the distribution and that you can be exposed to the majority of the distributions studied in class.

Q3: Probability distribution exercises (60 points)

# given:
meanlb <- 5
lambdalb <- 1/5
x = 7

# probability that the lifetime of a light bulb is more than 7 years:
a1 <- 1 - pexp(7,lambdalb)
print(paste0("Probability that the lifetime of a light bulb is more than 7 years = ", a1))
## [1] "Probability that the lifetime of a light bulb is more than 7 years = 0.246596963941606"
print("Distribution of the variable of the lifetime of a bulb is Exponential Distribution")
## [1] "Distribution of the variable of the lifetime of a bulb is Exponential Distribution"
# probability that the lifetime of a light bulb is more than 6 years:
a2 <- 1 - pexp(6,lambdalb)
print(paste0("Probability that the lifetime of a light bulb is more than 6 years = ", a2))
## [1] "Probability that the lifetime of a light bulb is more than 6 years = 0.301194211912202"
# P(x>8) | P(x>6) = P(x>8 and x>6) / P(x>6) = P(x>8) / P(x>6)
a3 <- (1-pexp(8,lambdalb))/(1-pexp(6,lambdalb))
print(paste0("The probability that after lasting for 6 years, the light bulb will have a remaining lifetime of more than 2 years = ",a3))
## [1] "The probability that after lasting for 6 years, the light bulb will have a remaining lifetime of more than 2 years = 0.670320046035639"
ProbTable <- data.frame(Months=c(3,4,5,6,7,8,9),
                            Probabilities=c(0.20,0.20,0.10,0.15,0.10,0.15,0.10))
ProbTable
cat(paste0("Sum of probabilities = ",sum(ProbTable$Probabilities)))
## Sum of probabilities = 1
cat("\nSum of probabilities is 1, hence the events are mutually exclusive")
## 
## Sum of probabilities is 1, hence the events are mutually exclusive
# (P<=5) <- P(3) + P(4) 
plt5 <- sum(ProbTable$Probabilities[which(ProbTable$Months<5)])
cat(paste0("\nProbability of having the project completed in less than 5 months = ",plt5))
## 
## Probability of having the project completed in less than 5 months = 0.4
# calculate expected values for the given probabilities
expectedValues <- data.frame(expVal = ProbTable$Months*ProbTable$Probabilities)
expectedValues
# add the expected values as a new clumn to the probability table
ProbTable <- cbind(ProbTable,expectedValues)
expValDayToComp <- sum(ProbTable$expVal)
# mean = 5.6 months
# total cost = $(100000 + 5.6*2000)

# given:
fixedCost <- 100000
addCost <- 2000

totalCost <- fixedCost + expValDayToComp*addCost

print(paste0("The expected cost of the project = $",totalCost))
## [1] "The expected cost of the project = $111200"
# given:
minss <- 10.7
maxss <- 14.7

print("The screen size follows a uniform distribution")
## [1] "The screen size follows a uniform distribution"
# subtract probability of screen size being less than or equal to 11.1 inches from the propability of screen size being less than or equal to 12.5 inches
p1 <- punif(12.5,min = minss, max = maxss)-punif(11.1, min = minss, max = maxss)

print(paste0("Probability that the notwbook screen size is between 11.1 and 12.5 inches = ",p1))
## [1] "Probability that the notwbook screen size is between 11.1 and 12.5 inches = 0.35"
cat("The probability of a continuous uniform distribution at any given specific point is 0.\nTo get the probability of the screen size being exactly 12.6 inches will be 0")
## The probability of a continuous uniform distribution at any given specific point is 0.
## To get the probability of the screen size being exactly 12.6 inches will be 0

Q4: Probability using the real estate data (from Assignment 1) (30 points)

# check working directory
getwd()
## [1] "C:/Users/mws16/OneDrive/Desktop/OPIM-5603-Statistics in Business Analytics/Homework 2"
# import csv files
townList <- read.csv("List_of_Towns.csv")
realEstate <- read.csv("Real_Estate_Sales_2014-2016.csv")

# merge both datasets on the shared key 'Town'
myData <- merge(x = realEstate, 
                y = townList[ , c('Town','County')], 
                by.x = 'Town', 
                by.y = 'Town')

# print merged data
myData
library(dplyr)

# create a subset of the data containing only County and PropertyType
pType <- myData%>%
          dplyr::select(County, PropertyType)
pType
options(scipen = 5)    

# create prop table to calculate probabilities
pt <- prop.table(table(pType),1)

pt
##                    PropertyType
## County                 Apartments    Commercial    Industrial
##   Fairfield County  0.00475398763 0.04174676573 0.00426040422
##   Hartford County   0.00963508500 0.03031729332 0.00451298522
##   Litchfield County 0.00125746621 0.03279891020 0.00345803207
##   Middlesex County  0.00038119441 0.02858958069 0.00304955527
##   New Haven County  0.00585071247 0.03353150263 0.00412066308
##   New London County 0.00298075285 0.03628002044 0.00272525975
##   Tolland County    0.00324412003 0.03325223033 0.00223033252
##   Windham County    0.00745861379 0.03347280335 0.00345643078
##                    PropertyType
## County              Public Utility   Residential   Vacant Land
##   Fairfield County   0.00012989037 0.92455967164 0.02454928041
##   Hartford County    0.00005537405 0.92856747328 0.02691178914
##   Litchfield County  0.00020957770 0.88661846380 0.07565755004
##   Middlesex County   0.00038119441 0.90952986023 0.05806861499
##   New Haven County   0.00003145544 0.93174168790 0.02472397848
##   New London County  0.00000000000 0.89788792369 0.06012604326
##   Tolland County     0.00000000000 0.89517437145 0.06609894566
##   Windham County     0.00000000000 0.86665453884 0.08895761324
#rownames(pt)
#colnames(pt)

# select the probability calculated for Vacant Land in Fairfield County 
ptvValue <- pt[which(rownames(pt)=="Fairfield County"),
                which(colnames(pt)=="Vacant Land")]

print(paste0("The probability of Property type being vacant land in Fairfield County = ",ptvValue))
## [1] "The probability of Property type being vacant land in Fairfield County = 0.0245492804073362"
# select the probability calculated for Apartments in Hartford County 
ptaValue <- pt[which(rownames(pt)=="Hartford County"),which(colnames(pt)=="Apartments")]

print(paste0("The probability of Property type being Apartment in Hartford County = ",ptaValue))
## [1] "The probability of Property type being Apartment in Hartford County = 0.00963508499916939"

Q5: Distribution of a variable - Assessed Value (from Assignment 1) (20 points)

# Remove outliers from AssessedValue
outlier_values <- boxplot.stats(myData$AssessedValue)$out

# To not lose the original data, store the data without outliers in a new variable
treated_myData = myData[-which(myData$AssessedValue %in% outlier_values),] 

# calculate statistical parameters of AssessedValue
meanAV <- mean(treated_myData$AssessedValue)
medianAV <- median(treated_myData$AssessedValue)
sdAV <- sd(treated_myData$AssessedValue)

realP <- c(paste0("Mean=",meanAV), paste0("Median=",medianAV), paste0("Standard deviation=",sdAV))

# set option to display numbers as integers and not in exponential form
options(scipen = 5)    

# Plot Kernel Density of AssessedValue to check the curve
plot(density(treated_myData$AssessedValue),
     main = "Kernel Density Plot for AssessedValue",
     cex.main=1.1)

cat("The density plot resembles the bell curve of a normal distribution\n")
## The density plot resembles the bell curve of a normal distribution
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# check maximum likelihood of AssessedValue following a normal distribution
fit <- fitdistr(treated_myData$AssessedValue, densfun = "normal")
fit
##       mean           sd     
##   162393.1670    90599.3105 
##  (   249.1768) (   176.1946)
cat("\nComparing the parameters of the variable AssessedValue and the fitted normal distribution:\n")
## 
## Comparing the parameters of the variable AssessedValue and the fitted normal distribution:
realP
## [1] "Mean=162393.16701084"               
## [2] "Median=143600"                      
## [3] "Standard deviation=90599.6531682618"
fit
##       mean           sd     
##   162393.1670    90599.3105 
##  (   249.1768) (   176.1946)
cat("\nIt can be seen that the plot resembles a normal distribution.\nThe real parameters match with the fitted normal distribution parameters.\nThe mean and median seem to be pretty close considering the range of the variable.\nAll these observations indicate that the variable Assessed value can be said to have a Normal Distribution.")
## 
## It can be seen that the plot resembles a normal distribution.
## The real parameters match with the fitted normal distribution parameters.
## The mean and median seem to be pretty close considering the range of the variable.
## All these observations indicate that the variable Assessed value can be said to have a Normal Distribution.

Part III - Inference Statistics The third part of the assignment will be focused on Inference Statistics. You will be requested to calculate some confidence intervals, make decisions based on this calculation and also do some hypothesis testing. Also, the concept of the Central Limit Theorem will be assessed in this part of the assignment.

Problem Description: In this part of the assignment, you will do some contingency tables similar to Assignment1 and do hypothesis testing.

Q6: Central Limit Theorem (20 points)

cat("The Central Limit Theorem states that if the sample size is large enough (greater than 30), any independent random variable will be normal or nearly normal.\nThe number of boxes here i.e. the sample size is 49. \nAs it is large enough, distribution followed by the variable weight of the boxes of this type can be treated as a Normal Distribution.")
## The Central Limit Theorem states that if the sample size is large enough (greater than 30), any independent random variable will be normal or nearly normal.
## The number of boxes here i.e. the sample size is 49. 
## As it is large enough, distribution followed by the variable weight of the boxes of this type can be treated as a Normal Distribution.
# Here 
# claimed mean = 205
# sample size = n = 49
# sd = 15
sm <- 205
n <- 49
sdv <- 15

# for the load to be transported safely, xbar will be the ideal required mean weight
xbar <- 9800/n

# calculating standard error
se <- sdv/sqrt(n)

# calculating z-score
z <- (xbar - sm)/se

# calculating the probability from the z-score
prob <- pnorm(z)

cat(paste0("\nThe probability that all 49 boxes can be safely loaded and transported = ",prob))
## 
## The probability that all 49 boxes can be safely loaded and transported = 0.00981532862864534
alpha <- 0.05
# for upperbound
ub <- qnorm((1-(alpha/2)),mean = sm, sd = se)
# for lowerbound
lb <- qnorm((0+(alpha/2)), mean = sm, sd = se)

cat(paste0("Interval covering 95% of the distribution:\n",lb," to ", ub))
## Interval covering 95% of the distribution:
## 200.800077175986 to 209.199922824014

Q7: Hypothesis Testing using the real estate data (from Assignment 1) (20 points)

library(dplyr)

# frequency table using dplyr
tableCounty <- myData%>%
      group_by(County)%>%
      summarise(freq = n())
# subset for Fairfield County
library(dplyr)
tableFC <- myData%>%
            filter(County=="Fairfield County")%>%
            group_by(Town)%>%
            summarise(freq=n())
          
nFC <- nrow(tableFC)            

# subset for Hartford County
tableHC <- myData%>%
            filter(County=="Hartford County")%>%
            group_by(Town)%>%
            summarise(freq=n())
          
nHC <- nrow(tableHC)   


# calculate proportions for town in Fairfield County
tableFCProp <- tableFC%>%
                mutate(prop = freq/sum(freq))

# calculate proportions for town in Hartford County
tableHCProp <- tableHC%>%
                mutate(prop = freq/sum(freq))

# Null hypothesis
print("H0 <- avgHC>=avgFC")
## [1] "H0 <- avgHC>=avgFC"
# Alternative hypothesis
print("H1 <- avgHC<avgFC")
## [1] "H1 <- avgHC<avgFC"
# significance level
alpha=0.05

# Perform t-test as the sample size is small
t.test(tableHCProp$prop, alternative = "less", mu = mean(tableFCProp$prop), conf.level = 1-alpha)
## 
##  One Sample t-test
## 
## data:  tableHCProp$prop
## t = -2.4556, df = 28, p-value = 0.01027
## alternative hypothesis: true mean is less than 0.04347826
## 95 percent confidence interval:
##        -Inf 0.04071436
## sample estimates:
##  mean of x 
## 0.03448276
cat("p-value is less than alpha, hence H0 can be rejected.\nHence proved, On average, towns in Fairfield County have more properties listed than Hartford County")
## p-value is less than alpha, hence H0 can be rejected.
## Hence proved, On average, towns in Fairfield County have more properties listed than Hartford County
# Null hypothesis
print("H0 <- avgHC>=avgFC")
## [1] "H0 <- avgHC>=avgFC"
# Alternative hypothesis
print("H1 <- avgHC<avgFC")
## [1] "H1 <- avgHC<avgFC"
# Perform t-test as the sample size is small
t.test(tableHC$freq, alternative = "less", mu = mean(tableFC$freq), conf.level = 1-alpha)
## 
##  One Sample t-test
## 
## data:  tableHC$freq
## t = -3.2364, df = 28, p-value = 0.001552
## alternative hypothesis: true mean is less than 1673.652
## 95 percent confidence interval:
##      -Inf 1470.521
## sample estimates:
## mean of x 
##  1245.448
cat("p-value is less than alpha, hence H0 can be rejected.\nHence proved, On average, towns in Fairfield County have more properties listed than Hartford County")
## p-value is less than alpha, hence H0 can be rejected.
## Hence proved, On average, towns in Fairfield County have more properties listed than Hartford County