#clearing workspace
rm(list = ls()) # Clear environment
cat("\f") # Clear the console
First, we need to calculate the z-score for an IQ of 110 using the formula:
z = (x - μ) / σ
where x is the IQ score, μ is the mean, and σ is the standard deviation.
In this case:
x <- 110
mu <- 100    # population mean
sigma <- 16  # population standard deviation
z_score <- (x - mu) / sigma
probability <- pnorm(z_score, lower.tail = FALSE)
probability
## [1] 0.2659855
There is a .2660 probability that a person randomly selected for jury duty has an IQ greater than or equal to 110.
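As a quick cross-check (a sketch rather than part of the original solution), pnorm can evaluate the normal tail directly without standardizing first:
#same tail probability in one call
pnorm(110, mean = 100, sd = 16, lower.tail = FALSE)
## [1] 0.2659855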
x <- 110
mu <- 100
sigma <- 16
n <- 12
standard_error <- sigma / sqrt(n)
z_score <- (x - mu) / standard_error
probability <- pnorm(z_score, lower.tail = FALSE)
rounded_probability <- round(probability, 4)
rounded_probability
## [1] 0.0152
There is a .0152 probability that the mean IQ of a 12-person jury is greater than or equal to 110.
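Again as a cross-check (a sketch), the tail of the sampling distribution can be computed in one call:
#same tail probability without standardizing by hand
pnorm(110, mean = 100, sd = 16 / sqrt(12), lower.tail = FALSE) #matches 0.0152 after rounding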
Step 1: Null/alternative hypotheses
Step 2: Significance level
Step 3: Distribution
Step 4: Test statistic from the sample
Ho: the proportion of American adults who do not have a four-year college degree is less than or equal to 0.50. Ha: the proportion of American adults who do not have a four-year college degree is greater than 0.50. Distribution: t (t because the sd is unknown, though z would also be reasonable since n > 30).
myp <- function(p, alpha) {
  if (p < alpha) {
    print("REJECT Ho")
  } else {
    print("FAIL TO REJECT Ho")
  }
}
phat <- 0.48 #sample proportion
p <- 0.50 #population hypothesized value
q <- 1 - p
alpha <- 0.05
n <- 331
df <- n - 1 #n must be defined before df
se <- sqrt(p * q / n)
t <- (phat - p) / se # t <- (sample estimate - null hypothesis value)/se
test_statistic <- t
test_statistic
## [1] -0.7277362
critical_value <- qt(p = 0.95, df = n - 1) #upper-tail critical value, since Ha is "greater than"
critical_value
## [1] 1.649484
p_value <- pnorm(t, lower.tail = FALSE) #upper-tail p-value, matching Ha
p_value
## [1] 0.7666125
myp(p = p_value, alpha = alpha)
## [1] "FAIL 2 REJECT"
We find a critical value of 1.649484, a test statistic of -0.7277, and a p-value of 0.7666. Because the test statistic does not exceed the critical value (equivalently, the p-value exceeds alpha), we fail to reject the null hypothesis: the data do not provide evidence that the true proportion of American adults who do not have a four-year college degree is greater than 0.50.
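As a cross-check (an illustrative sketch only: prop.test applies a continuity-corrected chi-square test, so its p-value will differ slightly, and the success count of 159 is my rounding of 0.48 * 331):
#hypothetical cross-check using the built-in proportion test
prop.test(x = 159, n = 331, p = 0.50, alternative = "greater")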
# Parameters
sample_proportion <- 0.48
n <- 331
confidence_level <- 0.95
# Calculate standard error
se <- sqrt((sample_proportion * (1 - sample_proportion)) / n)
# Calculate critical value
critical_value <- qnorm(1 - (1 - confidence_level) / 2)
# Calculate confidence interval
lower_bound <- sample_proportion - critical_value * se
upper_bound <- sample_proportion + critical_value * se
# Print confidence interval
cat("Lower bound:", lower_bound, "\n")
## Lower bound: 0.4261784
cat("Upper bound:", upper_bound, "\n")
## Upper bound: 0.5338216
Since 0.5 (the hypothesized value) falls within this interval (0.4262 - 0.5338), the confidence interval is consistent with the hypothesis test above: a true proportion of 0.50 cannot be ruled out.
# Null hypothesis (H0): The average number of food items recalled by the patients in the treatment and control groups are the same.
# Alternative hypothesis (H1): The average number of food items recalled by the patients in the treatment and control groups are different.
# Given data
x1 <- 4.9
s1 <- 1.8
n1 <- 22
x2 <- 6.1
s2 <- 1.8
n2 <- 22
# Calculate t-test statistic
t_statistic <- (x1 - x2) / sqrt((s1^2 / n1) + (s2^2 / n2))
# Degrees of freedom (conservative choice: the smaller of n1 - 1 and n2 - 1)
df <- min(n1 - 1, n2 - 1)
# Critical value for two-tailed test
alpha <- 0.05
critical_value <- qt(alpha / 2, df)
# Print t-test statistic and critical value
print(paste("t-test statistic:", t_statistic))
## [1] "t-test statistic: -2.21108319357027"
print(paste("Critical value:", critical_value))
## [1] "Critical value: -2.07961384472768"
if (abs(t_statistic) > abs(critical_value)) { # compare magnitudes; critical_value is negative
print("Reject the null hypothesis")
} else {
print("Fail to reject the null hypothesis")
}
## [1] "Reject the null hypothesis"
At the 5% significance level, the data provide evidence that the average number of food items recalled by patients differs between the treatment and control groups.
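The decision above compares |t| to the critical value; an equivalent p-value check is sketched below, using the same conservative df of 21. Because |t| exceeds the critical value, this p-value is guaranteed to fall below 0.05.
# Two-sided p-value: double the lower-tail area of the (negative) t statistic
p_value <- 2 * pt(t_statistic, df)
p_value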
#defining parameters of interest
n <- 25
upper.bound <- 77
lower.bound <- 65
alpha <- .1
#determining sample mean
x.bar <- (upper.bound + lower.bound)/2
print(x.bar)
## [1] 71
Interpretation: The mean of the sample is 71.
#determining margin of error
margin.error <- (upper.bound - lower.bound)/2
print(margin.error)
## [1] 6
Interpretation: The margin of error is 6.
#determining standard error
se <- margin.error/qnorm(.95)
print(se)
## [1] 3.647741
The standard error is 3.65. (Note: this step uses the normal critical value, while the sigma calculation below uses the t critical value; strictly, both steps should use the same distribution.)
#determining sigma
sigma <- (margin.error*sqrt(n))/qt(.95, df = n - 1)
print(sigma)
## [1] 17.53481
The standard deviation is 17.53.
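As a sanity check (assuming the original interval was t-based), the recovered mean and standard deviation reproduce the stated bounds:
#rebuilding the 90% CI from the recovered values
me.check <- qt(.95, df = n - 1) * sigma / sqrt(n) #margin of error, should equal 6
c(x.bar - me.check, x.bar + me.check) #should return 65 and 77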
mym <- matrix(c(4,30,24,45), nrow=2)
colnames(mym) <- c("control", "treatment")
rownames(mym) <- c("alive","dead")
mym
## control treatment
## alive 4 24
## dead 30 45
In this case, we cannot construct a confidence interval using the normal approximation because the conditions required for the normal approximation to be valid are not met. The normal approximation assumes that the sample proportions (survival rates) are approximately normally distributed, which requires the sample sizes to be sufficiently large.
To determine if the normal approximation is valid, we need to check if the following conditions are satisfied:
Independence: the observations within each group should be independent of each other.
Sample size: the number of successes (patients alive) and failures (patients dead) within each group should be sufficiently large (a common rule of thumb is at least 10 of each).
For the control group, we have 4 successes (patients alive) and 30 failures (patients dead). For the treatment group, we have 24 successes (patients alive) and 45 failures (patients dead).
Since the control group has only 4 successes, well below that threshold, the normal approximation is not appropriate. The sampling distribution of the proportion of survivors may be skewed rather than approximately normal.
If we constructed a confidence interval using the normal approximation despite these problems, we might obtain inaccurate results, leading to incorrect conclusions. The confidence interval might have incorrect coverage probability and might fail to capture the true difference in survival rates with the specified confidence level.
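Given the small counts, an exact method that avoids the normal approximation is the safer alternative; a minimal sketch using base R's fisher.test on the table built above:
#exact test for the 2x2 table; no large-sample approximation required
fisher.test(mym)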
The sinking of the Titanic is one of the most infamous shipwrecks in history. On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew. While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.
In this question, you are to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (i.e. name, age, gender, socio-economic class, etc.).
train <- read.csv("C:/Users/LENOVO/Downloads/Data Analytics/HW 1 Titanic/titanic/train-1.csv")
missing.observations <- colSums(is.na(train))
print(missing.observations)
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 0 0
#imputing missing values
age.source.data <- train$Age #Storing source data
age.median <- median(train$Age, na.rm = TRUE) #Calculating median age
train$Age[is.na(train$Age)] <- age.median #Replacing NAs with the median age
sum(is.na(train$Age)) #Check
## [1] 0
#transforming sex into a binary (1 = male, 0 = female)
train$Sex <- ifelse(train$Sex == "male", 1, 0)
cor(train$Survived, train$Age)
## [1] -0.06491042
Very weak, negative correlation between age and survival.
cor(train$Survived, train$Sex)
## [1] -0.5433514
Moderate, negative correlation between sex and survival (males were less likely to survive).
cor(train$Survived, train$Pclass)
## [1] -0.338481
Weak-to-moderate, negative correlation between passenger class and survival (a higher class number denotes a lower class).
cor(train$Survived, train$SibSp)
## [1] -0.0353225
Very weak, negative correlation between the number of siblings and spouses aboard and survival.
cor(train$Survived, train$Parch)
## [1] 0.08162941
Very weak, positive correlation between the number of parents and children aboard and survival.
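The same correlations can be obtained in a single call (a convenience sketch):
#pairwise correlations with Survived in one matrix
cor(train[, c("Survived", "Age", "Sex", "Pclass", "SibSp", "Parch")])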
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
set.seed(100)
train.subset <- sample_n(train, 500)
my.lm <- lm(Survived ~ Pclass + Age + Sex, data = train.subset)
summary(my.lm)
##
## Call:
## lm(formula = Survived ~ Pclass + Age + Sex, data = train.subset)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0791 -0.2568 -0.1164 0.2534 0.9845
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.267864 0.082326 15.401 < 2e-16 ***
## Pclass -0.176925 0.023193 -7.628 1.23e-13 ***
## Age -0.005937 0.001480 -4.012 6.94e-05 ***
## Sex -0.454444 0.039011 -11.649 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4007 on 496 degrees of freedom
## Multiple R-squared: 0.3227, Adjusted R-squared: 0.3186
## F-statistic: 78.76 on 3 and 496 DF, p-value: < 2.2e-16
The coefficients suggest that for each one-unit increase in Pclass and Age, the predicted probability of survival decreases by .177 and .006 respectively, and that being male (Sex = 1) lowers it by .454. All three coefficients are statistically significant at the .001 level. The adjusted R-squared value of .3186 suggests the model could be improved, given that a large amount of variability remains unexplained.
plot(my.lm)
#exploratory model with additional predictors
lm(Survived ~ Sex + Pclass + Age + SibSp + Parch + Embarked, data = train)
##
## Call:
## lm(formula = Survived ~ Sex + Pclass + Age + SibSp + Parch +
## Embarked, data = train)
##
## Coefficients:
## (Intercept) Sex Pclass Age
## 1.474072 -0.505518 -0.181208 -0.005857
## SibSp Parch EmbarkedC EmbarkedQ
## -0.039635 -0.012349 -0.096140 -0.104875
## EmbarkedS
## -0.165903
#predicted probabilities of survival
predictions <- predict(my.lm, newdata = train.subset, type = "response")
#defining threshold
threshold <- 0.5
#transforming predicted values into binary outcomes
predicted.survival <- ifelse(predictions >= threshold, yes = 1, no = 0)
table(predicted.survival)
## predicted.survival
## 0 1
## 339 161
My model predicts that 339 of the 500 sampled individuals did not survive the journey.
#expanding model to encompass full dataset
my.lm.2 <- lm(Survived ~ Pclass + Age + Sex, data = train)
summary(my.lm.2)
##
## Call:
## lm(formula = Survived ~ Pclass + Age + Sex, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.08688 -0.23209 -0.08626 0.22073 0.99862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.282644 0.058032 22.103 < 2e-16 ***
## Pclass -0.185779 0.016561 -11.218 < 2e-16 ***
## Age -0.004993 0.001058 -4.721 2.72e-06 ***
## Sex -0.499229 0.027332 -18.265 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3828 on 887 degrees of freedom
## Multiple R-squared: 0.3832, Adjusted R-squared: 0.3811
## F-statistic: 183.7 on 3 and 887 DF, p-value: < 2.2e-16
#predicted probabilities of survival
predictions.2 <- predict(my.lm.2, newdata = train, type = "response")
#defining threshold
threshold.2 <- 0.5
#transforming predicted values into binary outcomes
predicted.survival.2 <- ifelse(predictions.2 >= threshold.2, yes = 1, no = 0)
table(predicted.survival.2)
## predicted.survival.2
## 0 1
## 573 318
#building confusion matrix
observed <- train$Survived
outcome.table <- table(observed, predicted.survival.2)
outcome.table
## predicted.survival.2
## observed 0 1
## 0 467 82
## 1 106 236
#calculating accuracy
outcome.table.accuracy <- sum(diag(outcome.table)) / sum(outcome.table)
print(outcome.table.accuracy)
## [1] 0.7890011
When extending the model to the full train dataset, the model accurately classified 79% of passengers.
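Accuracy alone hides the balance between the two error types; a short sketch deriving sensitivity and specificity from the confusion matrix above:
#sensitivity and specificity from the confusion matrix
sensitivity <- outcome.table["1", "1"] / sum(outcome.table["1", ]) #236/342, about 0.69
specificity <- outcome.table["0", "0"] / sum(outcome.table["0", ]) #467/549, about 0.85
c(sensitivity = sensitivity, specificity = specificity)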
#### 5. Organise the 4 subparts above into a 1-page report
library(psych)
describe(train)
## vars n mean sd median trimmed mad min max range
## PassengerId 1 891 446.00 257.35 446.00 446.00 330.62 1.00 891.00 890.00
## Survived 2 891 0.38 0.49 0.00 0.35 0.00 0.00 1.00 1.00
## Pclass 3 891 2.31 0.84 3.00 2.39 0.00 1.00 3.00 2.00
## Name* 4 891 446.00 257.35 446.00 446.00 330.62 1.00 891.00 890.00
## Sex 5 891 0.65 0.48 1.00 0.68 0.00 0.00 1.00 1.00
## Age 6 891 29.36 13.02 28.00 28.83 8.90 0.42 80.00 79.58
## SibSp 7 891 0.52 1.10 0.00 0.27 0.00 0.00 8.00 8.00
## Parch 8 891 0.38 0.81 0.00 0.18 0.00 0.00 6.00 6.00
## Ticket* 9 891 339.52 200.83 338.00 339.65 268.35 1.00 681.00 680.00
## Fare 10 891 32.20 49.69 14.45 21.38 10.24 0.00 512.33 512.33
## Cabin* 11 891 18.63 38.14 1.00 8.29 0.00 1.00 148.00 147.00
## Embarked* 12 891 3.53 0.80 4.00 3.66 0.00 1.00 4.00 3.00
## skew kurtosis se
## PassengerId 0.00 -1.20 8.62
## Survived 0.48 -1.77 0.02
## Pclass -0.63 -1.28 0.03
## Name* 0.00 -1.20 8.62
## Sex -0.62 -1.62 0.02
## Age 0.51 0.97 0.44
## SibSp 3.68 17.73 0.04
## Parch 2.74 9.69 0.03
## Ticket* 0.00 -1.28 6.73
## Fare 4.77 33.12 1.66
## Cabin* 2.09 3.07 1.28
## Embarked* -1.27 -0.16 0.03
boxplot(train$Age~train$Survived, notch=TRUE, horizontal=T,
xlab = "Age",
ylab = "Survived",
main = "Age & Survival Rate"
)
train.2 <- as.data.frame(train)
#binning age variable
bins <- c(0, 18, 50, 80)
labels <- c("Youth", "Adult", "Old")
train.2 <- train.2 %>%
  mutate(age.categories = cut(Age, breaks = bins, labels = labels)) #cut() already returns a factor
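As a quick check on the binning (output omitted; note that the earlier median imputation places many passengers at age 28, inflating the Adult bin):
#checking the distribution of the new age categories
table(train.2$age.categories, useNA = "ifany")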
To test my assumption, I built a logistic regression model that controlled for Pclass, the new binned Age variable, Sex, Parch, and SibSp. I added the Parch variable as a control under the assumption that passengers with children aboard would behave differently than passengers traveling alone - parents will likely prioritize the welfare of their children, which could change the odds of survival. Similar logic justifies the inclusion of the SibSp variable - passengers may have behaved differently if “loved ones” were aboard, which could be a factor in survival odds.
my.logit <- glm(Survived ~ Pclass + age.categories + Sex + Parch + SibSp, data = train.2, family = "binomial")
summary(my.logit)
##
## Call:
## glm(formula = Survived ~ Pclass + age.categories + Sex + Parch +
## SibSp, family = "binomial", data = train.2)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.7190 0.4418 10.680 < 2e-16 ***
## Pclass -1.0759 0.1143 -9.409 < 2e-16 ***
## age.categoriesAdult -0.9888 0.2629 -3.761 0.000169 ***
## age.categoriesOld -1.7884 0.4282 -4.177 2.96e-05 ***
## Sex -2.7486 0.1978 -13.899 < 2e-16 ***
## Parch -0.1060 0.1132 -0.936 0.349383
## SibSp -0.3319 0.1092 -3.040 0.002367 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.66 on 890 degrees of freedom
## Residual deviance: 797.97 on 884 degrees of freedom
## AIC: 811.97
##
## Number of Fisher Scoring iterations: 5
The pattern of significance appears similar to the original OLS model - Pclass and Sex remain statistically significant. Interestingly, both the “Adult” and “Old” categories of the binned age variable are statistically significant at the .001 level. My assumption about SibSp also appears to hold - the presence of siblings or spouses aboard is statistically significant at the .01 level. However, the model challenges part of my reasoning, since the log-odds of survival decrease with each additional sibling or spouse aboard (the same holds for the Parch variable).
The difference between the null and residual deviance (388.69) indicates the model explains a large portion of the variability. The AIC, however, is mainly useful for comparing candidate models rather than as an absolute indicator of over-fitting.
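To use AIC as intended, the sketch below compares the full model against a reduced model dropping the non-significant Parch term (illustrative; the resulting values are not examined here):
#comparing AIC across candidate models; lower is preferred
my.logit.reduced <- update(my.logit, . ~ . - Parch)
AIC(my.logit, my.logit.reduced)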
#predicted probabilities of survival
predictions.2 <- predict(my.logit, newdata = train.2, type = "response")
#defining threshold
threshold.2 <- 0.5
#transforming predicted values into binary outcomes
predicted.survival.2 <- ifelse(predictions.2 >= threshold.2, yes = 1, no = 0)
table(predicted.survival.2)
## predicted.survival.2
## 0 1
## 597 294
#building confusion matrix
observed.2 <- train.2$Survived
outcome.table.2 <- table(observed.2, predicted.survival.2)
outcome.table.2
## predicted.survival.2
## observed.2 0 1
## 0 486 63
## 1 111 231
#determining accuracy
outcome.table.accuracy.2 <- sum(diag(outcome.table.2)) / sum(outcome.table.2)
print(outcome.table.accuracy.2)
## [1] 0.8047138
The accuracy of the new logistic model is somewhat higher than that of the old OLS model (80% vs. 79%). Adding further socioeconomic characteristics (Fare, alongside Pclass) could increase accuracy. Interacting age and sex, two highly statistically significant variables whose combination may be a more accurate predictor of survival, is another strategy that might be used.
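A minimal sketch of that suggested extension (hypothetical: Fare and the age-by-sex interaction are the additions proposed above, and they have not been verified to improve accuracy):
#hypothetical extension: add Fare and an age-by-sex interaction
my.logit.3 <- glm(Survived ~ Pclass + Fare + age.categories * Sex + Parch + SibSp,
                  data = train.2, family = "binomial")
summary(my.logit.3)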
# Import telecom_customer_churn dataset
telecom_customer_churn <- read.csv("telecom_customer_churn.csv")
head(telecom_customer_churn)
## Customer.ID Gender Age Married Number.of.Dependents City ZIPCODE
## 1 0002-ORFBO Female 37 Yes 0 Frazier Park 93225
## 2 0003-MKNFE Male 46 No 0 Glendale 91206
## 3 0004-TLHLJ Male 50 No 0 Costa Mesa 92627
## 4 0011-IGKFF Male 78 Yes 0 Martinez 94553
## 5 0013-EXCHZ Female 75 Yes 0 Camarillo 93010
## 6 0013-MHZWF Female 23 No 3 Midpines 95345
## Latitude Longitude Number.of.Referrals Tenure.in.Months Offer Phone.Service
## 1 34.82766 -118.9991 2 9 None Yes
## 2 34.16251 -118.2039 0 9 None Yes
## 3 33.64567 -117.9226 0 4 Offer E Yes
## 4 38.01446 -122.1154 1 13 Offer D Yes
## 5 34.22785 -119.0799 3 3 None Yes
## 6 37.58150 -119.9728 0 9 Offer E Yes
## Avg.Monthly.Long.Distance.Charges Multiple.Lines Internet.Service
## 1 42.39 No Yes
## 2 10.69 Yes Yes
## 3 33.65 No Yes
## 4 27.82 No Yes
## 5 7.38 No Yes
## 6 16.77 No Yes
## Internet.Type Avg.Monthly.GB.Download Online.Security Online.Backup
## 1 Cable 16 No Yes
## 2 Cable 10 No No
## 3 Fiber Optic 30 No No
## 4 Fiber Optic 4 No Yes
## 5 Fiber Optic 11 No No
## 6 Cable 73 No No
## Device.Protection.Plan Premium.Tech.Support Streaming.TV Streaming.Movies
## 1 No Yes Yes No
## 2 No No No Yes
## 3 Yes No No No
## 4 Yes No Yes Yes
## 5 No Yes Yes No
## 6 No Yes Yes Yes
## Streaming.Music Unlimited.Data Contract Paperless.Billing
## 1 No Yes One Year Yes
## 2 Yes No Month-to-Month No
## 3 No Yes Month-to-Month Yes
## 4 No Yes Month-to-Month Yes
## 5 No Yes Month-to-Month Yes
## 6 Yes Yes Month-to-Month Yes
## Payment.Method Monthly.Charge Total.Charges Total.Refunds
## 1 Credit Card 65.6 593.30 0.00
## 2 Credit Card -4.0 542.40 38.33
## 3 Bank Withdrawal 73.9 280.85 0.00
## 4 Bank Withdrawal 98.0 1237.85 0.00
## 5 Credit Card 83.9 267.40 0.00
## 6 Credit Card 69.4 571.45 0.00
## Total.Extra.Data.Charges Total.Long.Distance.Charges Total.Revenue
## 1 0 381.51 974.81
## 2 10 96.21 610.28
## 3 0 134.60 415.45
## 4 0 361.66 1599.51
## 5 0 22.14 289.54
## 6 0 150.93 722.38
## Customer.Status Churn.Category Churn.Reason
## 1 Stayed
## 2 Stayed
## 3 Churned Competitor Competitor had better devices
## 4 Churned Dissatisfaction Product dissatisfaction
## 5 Churned Dissatisfaction Network reliability
## 6 Stayed
# Import telecom_zipcode_population dataset
telecom_zipcode_population <- read.csv("telecom_zipcode_population.csv")
head(telecom_zipcode_population)
## ZIPCODE Population
## 1 90001 54492
## 2 90002 44586
## 3 90003 58198
## 4 90004 67852
## 5 90005 43019
## 6 90006 62784
# Load required libraries
library(dplyr)
# Import datasets
telecom_customer_churn <- read.csv("telecom_customer_churn.csv")
telecom_zipcode_population <- read.csv("telecom_zipcode_population.csv")
# Merge datasets based on geographic variable (ZIPCODE)
merged_data <- telecom_zipcode_population %>%
left_join(telecom_customer_churn, by = c("ZIPCODE"))
# Calculate average population and average age
average_data <- merged_data %>%
group_by(ZIPCODE) %>%
summarise(average_population = mean(Population),
average_age = mean(Age))
# Print the resulting table
average_data
## # A tibble: 1,671 × 3
## ZIPCODE average_population average_age
## <int> <dbl> <dbl>
## 1 90001 54492 52.2
## 2 90002 44586 50.2
## 3 90003 58198 46.2
## 4 90004 67852 39.4
## 5 90005 43019 42.2
## 6 90006 62784 37
## 7 90007 45025 43.8
## 8 90008 30852 45.6
## 9 90010 1957 30
## 10 90011 101215 38.4
## # ℹ 1,661 more rows