# Load libraries
library(readxl)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Import Excel file
pathogen <- read_excel("pathogen_dataset.xlsx")
# Inspect structure
str(pathogen)
## tibble [3,040 × 31] (S3: tbl_df/tbl/data.frame)
## $ Var1 : num [1:3040] 63.5 46.1 60.3 49.3 52.1 ...
## $ Var2 : num [1:3040] 37 34.2 64.9 54.4 47.3 ...
## $ Var3 : num [1:3040] 37.5 58.1 46.7 30.4 60.2 ...
## $ Var4 : num [1:3040] 57.2 36.5 62.3 52.9 51.4 ...
## $ Var5 : num [1:3040] 59.7 59.5 41.5 75 35.1 ...
## $ Var6 : num [1:3040] 64.2 54.9 53.3 64.9 53.9 ...
## $ Var7 : num [1:3040] 46.5 54.2 61.3 37.9 52.3 ...
## $ Var8 : num [1:3040] 65.2 51.3 72.6 51.3 71.7 ...
## $ Var9 : num [1:3040] 54.9 37.9 32.2 55 41.9 ...
## $ Var10 : num [1:3040] 38.9 59.7 44.4 60.2 55.8 ...
## $ Var11 : num [1:3040] 54.5 24.6 49.2 47.5 47.9 ...
## $ Var12 : num [1:3040] 30.1 43.9 44.4 51.3 49.8 ...
## $ Var13 : num [1:3040] 28.7 50.3 49.3 68.6 47.4 ...
## $ Var14 : num [1:3040] 33.3 47.2 31.5 52.7 62.9 ...
## $ Var15 : num [1:3040] 48.8 42.6 48.8 62.8 41.7 ...
## $ Var16 : num [1:3040] 49.4 41.1 56.4 47.1 49.6 ...
## $ Var17 : num [1:3040] 58.3 51 51.5 49.5 62.3 ...
## $ Var18 : num [1:3040] 40.7 39.7 58.4 57.3 29.1 ...
## $ Var19 : num [1:3040] 53.2 57.8 47.9 56 53.1 ...
## $ Var20 : num [1:3040] 50.5 51.9 41.1 53 46.7 ...
## $ ID : num [1:3040] 1782 1860 2878 2297 1492 ...
## $ Year : num [1:3040] 2013 2011 2023 2020 2019 ...
## $ Sex : chr [1:3040] "Female" "Male" "Female" "Male" ...
## $ Education : chr [1:3040] "Tertiary" "None" "Primary" "Secondary" ...
## $ WaterSource : chr [1:3040] "Borehole" "Borehole" "Tap" "Tap" ...
## $ BoiledWater : chr [1:3040] "No" "No" "No" "No" ...
## $ HandWashing : chr [1:3040] "No" "Yes" "Yes" "No" ...
## $ ToiletType : chr [1:3040] "Pit Latrine" "Pit Latrine" "Flush" "Pit Latrine" ...
## $ Age : num [1:3040] 23 59 21 26 28 64 39 76 19 19 ...
## $ Site : chr [1:3040] "Hospital B" "Hospital B" "Hospital A" "Clinic C" ...
## $ bact_isolated: chr [1:3040] "None" "E.coli" "None" "E.coli" ...
# number of variables
num_var <- ncol(pathogen)
cat("\n The number of variables in the dataset is :", num_var, "\n")
##
## The number of variables in the dataset is : 31
# number of observations
num_obs <- nrow(pathogen)
cat("\n The number of observation in the dataset is :", num_obs, "\n")
##
## The number of observation in the dataset is : 3040
# dim(pathogen) return a vector with first number is number of observation and second number is number of variables
In the data clean, I will check the following: (1) duplicate, (2) missing values, and (3) outliers (4) Data formatting (5) Correcting data types issues
cat("The dimension of original dataset is:", dim(pathogen),"\n\n")
## The dimension of original dataset is: 3040 31
# total number of duplicated data
cat("The total number of duplicated data is :", sum(duplicated(pathogen)), "\n\n")
## The total number of duplicated data is : 40
# Removing the duplicated data
pathogen_cleaned <- unique(pathogen)
cat("The dimension of cleaned dataset after removing duplicates is:", dim(pathogen_cleaned),"\n")
## The dimension of cleaned dataset after removing duplicates is: 3000 31
# Identifying the missing values and calculation the total missing data in each column
colSums(is.na(pathogen_cleaned))
## Var1 Var2 Var3 Var4 Var5
## 0 0 0 0 150
## Var6 Var7 Var8 Var9 Var10
## 0 0 0 0 0
## Var11 Var12 Var13 Var14 Var15
## 0 0 0 0 0
## Var16 Var17 Var18 Var19 Var20
## 0 0 0 0 0
## ID Year Sex Education WaterSource
## 0 0 50 30 0
## BoiledWater HandWashing ToiletType Age Site
## 0 0 0 0 0
## bact_isolated
## 0
Interpretation: This result shows that they are 50 missing value in sex and 30 in Education. Next step is to remove the rows with missing values.
pathogen_cleaned_1 <- na.omit(pathogen_cleaned)
dim(pathogen_cleaned_1)
## [1] 2775 31
Let me observe in the summary to see if there is no abnormal data points or outliers is the dataset
summary(pathogen_cleaned_1)
## Var1 Var2 Var3 Var4
## Min. : 16.28 Min. :16.69 Min. : 9.567 Min. :15.47
## 1st Qu.: 43.43 1st Qu.:43.01 1st Qu.:42.911 1st Qu.:43.09
## Median : 50.00 Median :49.95 Median :49.992 Median :49.62
## Mean : 50.84 Mean :50.01 Mean :49.863 Mean :49.83
## 3rd Qu.: 56.88 3rd Qu.:56.89 3rd Qu.:56.642 3rd Qu.:56.57
## Max. :166.20 Max. :82.99 Max. :93.281 Max. :83.39
## Var5 Var6 Var7 Var8
## Min. :15.60 Min. :16.23 Min. :14.93 Min. :13.57
## 1st Qu.:43.43 1st Qu.:43.08 1st Qu.:43.13 1st Qu.:43.22
## Median :50.09 Median :50.12 Median :50.21 Median :50.07
## Mean :50.00 Mean :49.93 Mean :50.13 Mean :50.17
## 3rd Qu.:56.80 3rd Qu.:56.58 3rd Qu.:57.16 3rd Qu.:57.29
## Max. :86.63 Max. :81.98 Max. :82.26 Max. :84.94
## Var9 Var10 Var11 Var12
## Min. :12.84 Min. :14.47 Min. :13.77 Min. :13.50
## 1st Qu.:43.15 1st Qu.:42.93 1st Qu.:43.49 1st Qu.:42.81
## Median :50.34 Median :49.59 Median :50.45 Median :49.86
## Mean :50.09 Mean :49.88 Mean :50.27 Mean :49.69
## 3rd Qu.:56.92 3rd Qu.:56.62 3rd Qu.:57.03 3rd Qu.:56.61
## Max. :90.66 Max. :87.19 Max. :85.91 Max. :85.08
## Var13 Var14 Var15 Var16
## Min. :14.96 Min. :14.66 Min. :17.67 Min. :18.16
## 1st Qu.:42.83 1st Qu.:43.31 1st Qu.:43.20 1st Qu.:43.12
## Median :50.07 Median :49.97 Median :49.87 Median :49.75
## Mean :50.00 Mean :49.99 Mean :49.97 Mean :49.71
## 3rd Qu.:56.83 3rd Qu.:56.50 3rd Qu.:56.67 3rd Qu.:56.35
## Max. :88.76 Max. :86.26 Max. :83.98 Max. :81.84
## Var17 Var18 Var19 Var20
## Min. :16.40 Min. :10.97 Min. :15.82 Min. :17.54
## 1st Qu.:43.32 1st Qu.:43.18 1st Qu.:43.35 1st Qu.:43.05
## Median :50.04 Median :49.98 Median :50.31 Median :49.93
## Mean :49.83 Mean :49.97 Mean :50.21 Mean :49.86
## 3rd Qu.:56.65 3rd Qu.:56.52 3rd Qu.:56.76 3rd Qu.:56.80
## Max. :84.86 Max. :81.99 Max. :82.71 Max. :80.40
## ID Year Sex Education
## Min. : 1.0 Min. :2010 Length :2775 Length :2775
## 1st Qu.: 743.5 1st Qu.:2014 N.unique : 2 N.unique : 4
## Median :1485.0 Median :2018 N.blank : 0 N.blank : 0
## Mean :1492.7 Mean :2018 Min.nchar: 4 Min.nchar: 4
## 3rd Qu.:2240.5 3rd Qu.:2022 Max.nchar: 6 Max.nchar: 9
## Max. :3000.0 Max. :2025
## WaterSource BoiledWater HandWashing ToiletType
## Length :2775 Length :2775 Length :2775 Length :2775
## N.unique : 4 N.unique : 2 N.unique : 2 N.unique : 3
## N.blank : 0 N.blank : 0 N.blank : 0 N.blank : 0
## Min.nchar: 3 Min.nchar: 2 Min.nchar: 2 Min.nchar: 4
## Max.nchar: 8 Max.nchar: 3 Max.nchar: 3 Max.nchar: 11
##
## Age Site bact_isolated
## Min. : 1.00 Length :2775 Length :2775
## 1st Qu.:20.00 N.unique : 3 N.unique : 5
## Median :40.00 N.blank : 0 N.blank : 0
## Mean :40.02 Min.nchar: 8 Min.nchar: 4
## 3rd Qu.:60.00 Max.nchar: 10 Max.nchar: 10
## Max. :80.00
Outliers are easily checked though boxplot
# removing outliers in Var1 since it has data points above 100 while other variables the maximun is around 90
boxplot(pathogen_cleaned_1$Age,main = "Boxplot of Age to check outliers")
pathogen_cleaned_2 <- pathogen_cleaned_1 %>%
filter(Var1 >20 & Var1 < 80)
#pathogen_cleaned_2 <- subset(pathogen_cleaned_1, Var1 < 90)
boxplot(pathogen_cleaned_2[,1:20])
dim(pathogen_cleaned_2)
## [1] 2741 31
library(dplyr)
data_cleaned <- pathogen_cleaned_1 %>%
# Apply this rule to ALL numeric columns
filter(if_all(where(is.numeric), ~
# Keep if the value is within the IQR bounds OR if it is NA
between(.,
quantile(., 0.25, na.rm = TRUE) - 1.5 * IQR(., na.rm = TRUE),
quantile(., 0.75, na.rm = TRUE) + 1.5 * IQR(., na.rm = TRUE)
) | is.na(.)
))
# Check how many rows were removed
cat("Original rows:", nrow(pathogen_cleaned_1), "\n")
## Original rows: 2775
cat("Rows after outlier removal:", nrow(pathogen_cleaned_2), "\n")
## Rows after outlier removal: 2741
boxplot(data_cleaned[,1:20])
## Scatter plot
plot(data_cleaned$Var1, data_cleaned$Var2)
library(dplyr)
cleaned_1 <- pathogen_cleaned_1 %>%
filter(Var1 <= 100)
dim(cleaned_1)
## [1] 2749 31
boxplot(cleaned_1[,1:20])
hist(cleaned_1$Var1)
#library(dplyr)
#pathogen_cleaned_2 <- pathogen_cleaned_1 %>%
# Remove outliers in Var1
# filter(
# Var1 >= (quantile(Var1, 0.25, na.rm = TRUE) - 1.5 * IQR(Var1, na.rm = TRUE)) &
# Var1 <= (quantile(Var1, 0.75, na.rm = TRUE) + 1.5 * IQR(Var1, na.rm = TRUE))
# )
#dim(pathogen_cleaned_2)
#boxplot(pathogen_cleaned_2$Var1, main = "Boxplot of Var1 after removing outliers")
#dim(pathogen_cleaned_1)
barplot(table(data_cleaned$bact_isolated),
xlab ="Bacteria Isolated",
ylab ="Counts",
main ="Prevalence of Bacteria Isolated",
col ="blue",
las=2,
border="black"
)
ggplot(data_cleaned, aes(x = bact_isolated)) +
geom_bar(fill = "steelblue") +
labs(title = "Prevalence of Bacteria Isolated",
x = "Bacteria Isolated", y = "Count")
### Task 5: Display a side by side bar chart that shows the distribution
of bact_isolated by sites
barplot(table(data_cleaned$bact_isolated, data_cleaned$Site),
col=c("blue","red","purple","orange", "pink"),
beside = TRUE,
xlab ="Sites",
ylab= "Counts",
main = "Distribution of Bacteria Isolated by sites",
legend.text = TRUE,
args.legend = list(x = "topright", inset = c(-0.1, 0), bty = "n"))
ggplot(data_cleaned, aes(x = Site, fill = bact_isolated)) +
geom_bar(position = "dodge") +
labs(title = "Distribution of Bacteria Isolated by Site",
x = "Site", y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Filter the data for 2014 onwards
data_2014 <- subset(data_cleaned, data_cleaned$Year>=2014)
barplot(table(data_2014$bact_isolated, data_2014$Year ),
beside = TRUE,
col= c("blue","red", "purple", "pink"),
xlab ="Year",
ylab ="count",
las = 1,
main = "Bacteria isolated trend from 2014 onwards",
legend.text = TRUE,
args.legend = list(x = "topright", inset = c(-0.140, 0), bty = "n"))
# Trend from 2014 onward
ggplot(data_cleaned %>% filter(Year >= 2014),
aes(x = Year, fill = bact_isolated)) +
geom_bar(position = "dodge") +
labs(title = "Trend of Bacteria Isolated (2014+)",
x = "Year", y = "Count")
### Task 7: Create two dataset of observation of 1) under 16 years
(Young) and 2) above 16 years old (adults) and compare the bact_isolated
prevalence among the young and adults. Note that the number of
observations in each analysis should remailn the same since you have
removed outliers.
#Young vs Adults
young <- subset(data_cleaned, data_cleaned$Age<16)
adult <- subset(data_cleaned, data_cleaned$Age>=16)
young_prev <- table(young$bact_isolated)
adult_prev <- table(adult$bact_isolated)
barplot(rbind(young_prev,adult_prev),
col = c("blue","purple"),
beside =TRUE,
main = "Comparison of young and adult prevalence of bacteria isolated",
legend = c("Young (<16)", "Adults (≥16)"))
young <- pathogen %>% filter(Age < 16)
adults <- pathogen %>% filter(Age >= 16)
young_prev <- table(young$bact_isolated)
adults_prev <- table(adults$bact_isolated)
barplot(rbind(young_prev, adults_prev),
beside = TRUE,
col = c("skyblue", "orange"),
legend = c("Young (<16)", "Adults (≥16)"),
main = "Bacteria Isolated Prevalence: Young vs Adults")
### Task 8: Model Building
There is a need of converting categorical variables into numbers as: (1) Sex (Male:1, Female:0) (2) Education: (None: 0, Primary:1, Seconday:2, tertiary:3 ) (3) BoiledWater: (Yes:1, No:0)
# Convert categorical variables into numeric codes
# Sex (Male = 1, Female = 0)
data_cleaned$Sex_num <- ifelse(data_cleaned$Sex == "Male", 1, 0)
# Education (No = 0, Primary = 1, Secondary = 2, Tertiary = 3)
# Adjust levels based on your dataset
data_cleaned$Education_num <- recode(data_cleaned$Education,
"None" = 0,
"Primary" = 1,
"Secondary" = 2,
"Tertiary" = 3)
#BoiledWater (Yes = 1, No = 0)
data_cleaned$BoiledWater_num <- ifelse(data_cleaned$BoiledWater == "Yes", 1, 0)
# Check the transformed variables
#head(data_cleaned[, c("Sex", "Sex_num", "Education", "Education_num", "BoiledWater", #"BoiledWater_num")])
library(dplyr)
# Select the response AND the predictors into one dataframe
model_data <- data_cleaned %>%
select(Age,Var1, Var2, Var3, Var4, Var5, Sex_num, Education_num, BoiledWater_num)
# Fit the model using "." to represent all other columns as predictors
model_full <- lm(Var1 ~ ., data = model_data)
# View the results
summary(model_full)
##
## Call:
## lm(formula = Var1 ~ ., data = model_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.9958 -6.5727 0.0368 6.5864 26.6960
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.979489 2.102561 23.295 <2e-16 ***
## Age 0.007282 0.008608 0.846 0.398
## Var2 0.017158 0.019930 0.861 0.389
## Var3 0.015716 0.020371 0.772 0.440
## Var4 -0.006975 0.020238 -0.345 0.730
## Var5 -0.019472 0.020468 -0.951 0.342
## Sex_num 0.377194 0.399158 0.945 0.345
## Education_num 0.199432 0.176430 1.130 0.258
## BoiledWater_num -0.182808 0.399371 -0.458 0.647
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.791 on 2405 degrees of freedom
## Multiple R-squared: 0.002218, Adjusted R-squared: -0.001101
## F-statistic: 0.6684 on 8 and 2405 DF, p-value: 0.7198
# Correlection within the model parameters
# Correlation matrix for numeric variables
cor(model_data, use = "complete.obs")
## Age Var1 Var2 Var3 Var4
## Age 1.000000000 0.017567868 -0.01122965 0.016218420 -0.018830359
## Var1 0.017567868 1.000000000 0.01705973 0.016199080 -0.008241871
## Var2 -0.011229651 0.017059728 1.00000000 0.041175213 -0.008441420
## Var3 0.016218420 0.016199080 0.04117521 1.000000000 -0.044683285
## Var4 -0.018830359 -0.008241871 -0.00844142 -0.044683285 1.000000000
## Var5 0.015524624 -0.018056091 0.01308902 0.002165681 0.009372565
## Sex_num -0.001243651 0.018380705 -0.01089185 -0.024955730 -0.028061307
## Education_num 0.018759165 0.022408447 -0.03159557 -0.013564094 0.028962350
## BoiledWater_num -0.003024812 -0.008706729 -0.01845371 0.003659996 -0.002607490
## Var5 Sex_num Education_num BoiledWater_num
## Age 0.015524624 -0.001243651 0.018759165 -0.003024812
## Var1 -0.018056091 0.018380705 0.022408447 -0.008706729
## Var2 0.013089023 -0.010891851 -0.031595568 -0.018453713
## Var3 0.002165681 -0.024955730 -0.013564094 0.003659996
## Var4 0.009372565 -0.028061307 0.028962350 -0.002607490
## Var5 1.000000000 0.022532410 -0.004372270 -0.060690726
## Sex_num 0.022532410 1.000000000 -0.005513267 -0.009093307
## Education_num -0.004372270 -0.005513267 1.000000000 -0.002919669
## BoiledWater_num -0.060690726 -0.009093307 -0.002919669 1.000000000
# Fit the full model
model_full_1 <- lm(Var1 ~ Var2 + Var3 + Var4 + Var5 + Age + Sex_num + Education_num + BoiledWater_num,
data = data_cleaned)
summary(model_full_1)
##
## Call:
## lm(formula = Var1 ~ Var2 + Var3 + Var4 + Var5 + Age + Sex_num +
## Education_num + BoiledWater_num, data = data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.9958 -6.5727 0.0368 6.5864 26.6960
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.979489 2.102561 23.295 <2e-16 ***
## Var2 0.017158 0.019930 0.861 0.389
## Var3 0.015716 0.020371 0.772 0.440
## Var4 -0.006975 0.020238 -0.345 0.730
## Var5 -0.019472 0.020468 -0.951 0.342
## Age 0.007282 0.008608 0.846 0.398
## Sex_num 0.377194 0.399158 0.945 0.345
## Education_num 0.199432 0.176430 1.130 0.258
## BoiledWater_num -0.182808 0.399371 -0.458 0.647
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.791 on 2405 degrees of freedom
## Multiple R-squared: 0.002218, Adjusted R-squared: -0.001101
## F-statistic: 0.6684 on 8 and 2405 DF, p-value: 0.7198
Interpretations: Look at the p‑values: predictors with very high p‑values (e.g., >0.05 or >0.1) may not contribute much.
#Actual vs predicted
predicted_full <- predict(model_full)
plot(predicted_full,
data_cleaned$Var1,
col="blue",
pch = 19,
xlab = "Actual",
ylab = "Predicted",
main = "Actual vs fitted plot"
)
abline(0,1, col="red", lws=2)
## Warning in int_abline(a = a, b = b, h = h, v = v, untf = untf, ...): "lws" is
## not a graphical parameter
Interpretation of the graph: This confirms that there
no relationship between the response and the predictors variables.
You can use automated selection to refine predictors: ### Forward selection method
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
stepAIC(model_full, direction = "forward")
## Start: AIC=11023.84
## Var1 ~ Age + Var2 + Var3 + Var4 + Var5 + Sex_num + Education_num +
## BoiledWater_num
##
## Call:
## lm(formula = Var1 ~ Age + Var2 + Var3 + Var4 + Var5 + Sex_num +
## Education_num + BoiledWater_num, data = model_data)
##
## Coefficients:
## (Intercept) Age Var2 Var3
## 48.979489 0.007282 0.017158 0.015716
## Var4 Var5 Sex_num Education_num
## -0.006975 -0.019472 0.377194 0.199432
## BoiledWater_num
## -0.182808
stepAIC(model_full, direction = "backward")
## Start: AIC=11023.84
## Var1 ~ Age + Var2 + Var3 + Var4 + Var5 + Sex_num + Education_num +
## BoiledWater_num
##
## Df Sum of Sq RSS AIC
## - Var4 1 11.386 230559 11022
## - BoiledWater_num 1 20.085 230568 11022
## - Var3 1 57.058 230605 11022
## - Age 1 68.597 230616 11023
## - Var2 1 71.048 230619 11023
## - Sex_num 1 85.602 230633 11023
## - Var5 1 86.753 230635 11023
## - Education_num 1 122.487 230670 11023
## <none> 230548 11024
##
## Step: AIC=11021.96
## Var1 ~ Age + Var2 + Var3 + Var5 + Sex_num + Education_num + BoiledWater_num
##
## Df Sum of Sq RSS AIC
## - BoiledWater_num 1 20.020 230579 11020
## - Var3 1 59.465 230619 11021
## - Age 1 69.686 230629 11021
## - Var2 1 71.416 230631 11021
## - Var5 1 87.424 230647 11021
## - Sex_num 1 87.523 230647 11021
## - Education_num 1 120.471 230680 11021
## <none> 230559 11022
##
## Step: AIC=11020.17
## Var1 ~ Age + Var2 + Var3 + Var5 + Sex_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Var3 1 59.168 230638 11019
## - Age 1 69.858 230649 11019
## - Var2 1 72.814 230652 11019
## - Var5 1 82.755 230662 11019
## - Sex_num 1 88.187 230667 11019
## - Education_num 1 120.837 230700 11019
## <none> 230579 11020
##
## Step: AIC=11018.79
## Var1 ~ Age + Var2 + Var5 + Sex_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Age 1 72.065 230710 11018
## - Var2 1 78.379 230717 11018
## - Var5 1 82.494 230721 11018
## - Sex_num 1 84.718 230723 11018
## - Education_num 1 118.713 230757 11018
## <none> 230638 11019
##
## Step: AIC=11017.55
## Var1 ~ Var2 + Var5 + Sex_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Var2 1 76.764 230787 11016
## - Var5 1 80.098 230791 11016
## - Sex_num 1 84.465 230795 11016
## - Education_num 1 122.196 230833 11017
## <none> 230710 11018
##
## Step: AIC=11016.35
## Var1 ~ Var5 + Sex_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Var5 1 78.053 230865 11015
## - Sex_num 1 82.655 230870 11015
## - Education_num 1 116.267 230903 11016
## <none> 230787 11016
##
## Step: AIC=11015.17
## Var1 ~ Sex_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Sex_num 1 79.119 230944 11014
## - Education_num 1 117.080 230982 11014
## <none> 230865 11015
##
## Step: AIC=11013.99
## Var1 ~ Education_num
##
## Df Sum of Sq RSS AIC
## - Education_num 1 116.02 231060 11013
## <none> 230944 11014
##
## Step: AIC=11013.21
## Var1 ~ 1
##
## Call:
## lm(formula = Var1 ~ 1, data = model_data)
##
## Coefficients:
## (Intercept)
## 49.98
stepAIC(model_full, direction = "both")
## Start: AIC=11023.84
## Var1 ~ Age + Var2 + Var3 + Var4 + Var5 + Sex_num + Education_num +
## BoiledWater_num
##
## Df Sum of Sq RSS AIC
## - Var4 1 11.386 230559 11022
## - BoiledWater_num 1 20.085 230568 11022
## - Var3 1 57.058 230605 11022
## - Age 1 68.597 230616 11023
## - Var2 1 71.048 230619 11023
## - Sex_num 1 85.602 230633 11023
## - Var5 1 86.753 230635 11023
## - Education_num 1 122.487 230670 11023
## <none> 230548 11024
##
## Step: AIC=11021.96
## Var1 ~ Age + Var2 + Var3 + Var5 + Sex_num + Education_num + BoiledWater_num
##
## Df Sum of Sq RSS AIC
## - BoiledWater_num 1 20.020 230579 11020
## - Var3 1 59.465 230619 11021
## - Age 1 69.686 230629 11021
## - Var2 1 71.416 230631 11021
## - Var5 1 87.424 230647 11021
## - Sex_num 1 87.523 230647 11021
## - Education_num 1 120.471 230680 11021
## <none> 230559 11022
## + Var4 1 11.386 230548 11024
##
## Step: AIC=11020.17
## Var1 ~ Age + Var2 + Var3 + Var5 + Sex_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Var3 1 59.168 230638 11019
## - Age 1 69.858 230649 11019
## - Var2 1 72.814 230652 11019
## - Var5 1 82.755 230662 11019
## - Sex_num 1 88.187 230667 11019
## - Education_num 1 120.837 230700 11019
## <none> 230579 11020
## + BoiledWater_num 1 20.020 230559 11022
## + Var4 1 11.320 230568 11022
##
## Step: AIC=11018.79
## Var1 ~ Age + Var2 + Var5 + Sex_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Age 1 72.065 230710 11018
## - Var2 1 78.379 230717 11018
## - Var5 1 82.494 230721 11018
## - Sex_num 1 84.718 230723 11018
## - Education_num 1 118.713 230757 11018
## <none> 230638 11019
## + Var3 1 59.168 230579 11020
## + BoiledWater_num 1 19.723 230619 11021
## + Var4 1 13.715 230625 11021
##
## Step: AIC=11017.55
## Var1 ~ Var2 + Var5 + Sex_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Var2 1 76.764 230787 11016
## - Var5 1 80.098 230791 11016
## - Sex_num 1 84.465 230795 11016
## - Education_num 1 122.196 230833 11017
## <none> 230710 11018
## + Age 1 72.065 230638 11019
## + Var3 1 61.374 230649 11019
## + BoiledWater_num 1 19.891 230691 11019
## + Var4 1 14.974 230695 11019
##
## Step: AIC=11016.35
## Var1 ~ Var5 + Sex_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Var5 1 78.053 230865 11015
## - Sex_num 1 82.655 230870 11015
## - Education_num 1 116.267 230903 11016
## <none> 230787 11016
## + Var2 1 76.764 230710 11018
## + Age 1 70.449 230717 11018
## + Var3 1 66.951 230720 11018
## + BoiledWater_num 1 21.308 230766 11018
## + Var4 1 15.519 230772 11018
##
## Step: AIC=11015.17
## Var1 ~ Sex_num + Education_num
##
## Df Sum of Sq RSS AIC
## - Sex_num 1 79.119 230944 11014
## - Education_num 1 117.080 230982 11014
## <none> 230865 11015
## + Var5 1 78.053 230787 11016
## + Var2 1 74.719 230791 11016
## + Age 1 68.132 230797 11016
## + Var3 1 66.565 230799 11016
## + BoiledWater_num 1 16.589 230849 11017
## + Var4 1 16.231 230849 11017
##
## Step: AIC=11013.99
## Var1 ~ Education_num
##
## Df Sum of Sq RSS AIC
## - Education_num 1 116.024 231060 11013
## <none> 230944 11014
## + Sex_num 1 79.119 230865 11015
## + Var5 1 74.517 230870 11015
## + Var2 1 73.017 230871 11015
## + Age 1 67.964 230876 11015
## + Var3 1 62.941 230881 11015
## + Var4 1 18.280 230926 11016
## + BoiledWater_num 1 17.254 230927 11016
##
## Step: AIC=11013.21
## Var1 ~ 1
##
## Df Sum of Sq RSS AIC
## <none> 231060 11013
## + Education_num 1 116.024 230944 11014
## + Sex_num 1 78.064 230982 11014
## + Var5 1 75.331 230985 11014
## + Age 1 71.312 230989 11014
## + Var2 1 67.247 230993 11014
## + Var3 1 60.633 231000 11015
## + BoiledWater_num 1 17.516 231043 11015
## + Var4 1 15.696 231045 11015
##
## Call:
## lm(formula = Var1 ~ 1, data = model_data)
##
## Coefficients:
## (Intercept)
## 49.98
#detach("package:MASS", unload = TRUE)
library(dplyr)
# Fit simple regression model
model_var1 <- lm(Var1 ~ Age, data = data_cleaned)
# View summary
summary(model_var1)
##
## Call:
## lm(formula = Var1 ~ Age, data = data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.4912 -6.5245 0.0342 6.6364 27.0106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.681855 0.398006 124.827 <2e-16 ***
## Age 0.007419 0.008598 0.863 0.388
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.786 on 2412 degrees of freedom
## Multiple R-squared: 0.0003086, Adjusted R-squared: -0.0001058
## F-statistic: 0.7446 on 1 and 2412 DF, p-value: 0.3883
predicted_var1 <- predict(model_var1)
plot(
predicted_var1,
data_cleaned$Var1,
xlab = "Actual Var 1",
ylab ="Age",
col="blue",
pch=19,
main ="Actual Var1 vs predicted var1",
)
abline(0,1, col="red", lwd=2)
Interpretation: The actual vs predicted plot shows that
there is no relationship between the response and predictor variable.
Meaning that the linear regression equation presents only intercept
value. I can also by determining the correlation coefficient
round(cor(data_cleaned$Var1, data_cleaned$Age),4)
## [1] 0.0176
The Pearson correlation coefficient shows a very weak positive relationship that is approximately zero. Almost no relationship in the two variables.
par(mfrow=c(2,2))
plot(model_var1)
Interpretation: The linearity,normality, and
homoscedasticity assumptions are satified based on the fitted vs
residuals plot, the QQ plot, and scale location plot, respectively.
Method 1: Simple Linear Regression p_values Since the p_value for Age variable is 0.388 that is greater 0.05. Then there is no statistical significance between Age and Var1.
#Method 2: Pearson Correlation Test
# Run the Pearson correlation test
cor.test(data_cleaned$Age, data_cleaned$Var1)
##
## Pearson's product-moment correlation
##
## data: data_cleaned$Age and data_cleaned$Var1
## t = 0.86293, df = 2412, p-value = 0.3883
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02234283 0.05742266
## sample estimates:
## cor
## 0.01756787
Interpretation: There is a very weak positive correlation between Age and Var1 (r=0.018), but the relationship is not statistically significant (p=0.388>0.05). Therefore, Age and Var1 do not have a meaningful linear relationship.
# Scatter plot using simple scatter chart
plot(data_cleaned$Age, data_cleaned$Var1, col="purple", pch=19, xlab="Age", ylab="Var1",
main= "Scatter plot of Age and Var1 with fitted line")
# Add fitted line
abline(model_var1, col="red", lwd=3)
### Scatter plot by ggplot2 package
library(ggplot2)
# Scatter plot with regression line
ggplot(data_cleaned, aes(x = Age, y = Var1)) +
geom_point(alpha = 0.8, color = "steelblue") + # scatter points
geom_smooth(method = "lm", se = TRUE, color = "red") + # regression line
labs(title = "Relationship between Age and Var1",
x = "Age",
y = "Var1") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
### 8.IV. Assess the model assumptions by producing diagnostic plots
(residuals vs ftted,Q-Q plot, etc.)
#Diagnostic plots
plot(model_var1)
model_var2 <- lm(Var2~ Age, data=data_cleaned)
summary(model_var2)
##
## Call:
## lm(formula = Var2 ~ Age, data = data_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.3836 -7.0716 -0.1044 6.9082 27.3366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.286031 0.407495 123.403 <2e-16 ***
## Age -0.004855 0.008803 -0.552 0.581
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.02 on 2412 degrees of freedom
## Multiple R-squared: 0.0001261, Adjusted R-squared: -0.0002884
## F-statistic: 0.3042 on 1 and 2412 DF, p-value: 0.5813
plot(data_cleaned$Var2, data_cleaned$Age,
col="blue",
pch=19,
xlab="Age",
ylab="Var2",
main ="Relationship between Var2 and Age.")
abline(model_var2, col="red", lwd=3)
## Plot of Actual vs fitted
predicted_var2 <-predict(model_var2)
#plot actual vs predicted
plot(
predicted_var2,
data_cleaned$Var2,
xlab= "Actual",
ylab = "Predicted",
main ="Actual vs Predicted plot",
col="blue",
pch = 17,
)
abline(0,1, col = "red", lwd = 3)
The \(R^2\) values across all models are extremely close to zero, meaning none of the predictors explain meaningful variance in the outcome.
The p-values for Age (0.388, and 0.581) and for all other predictors in the full model are well above 0.05, so none are statistically significant.
Between Age predicting Var1 and Age predicting Var2, Var1 ~ Age has a slightly lower p-value (0.388 vs. 0.581) and a marginally higher \(R^{2}\) (0.0003 vs. 0.0001). But both are so weak that neither can be considered a strong predictor.
Therefore, none of the predictors are strong. If you had to choose, Age predicting Var1 is slightly stronger than Age predicting Var2, but the difference is negligible and not statistically meaningful.