# Load the diabetes data set from the working directory and preview the
# first six rows.
Diabetes <- read.csv(file = "diabetes.csv")
print(head(Diabetes, n = 6L))
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# Report the column names of the data frame and how many columns it has.
column_names <- names(Diabetes)
num_columns <- ncol(Diabetes)
cat("Column names of Diabetes are", column_names, "\n", "\n")
## Column names of Diabetes are Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
##
cat("Total number of columns is", num_columns)
## Total number of columns is 9
# Extraction of the numerical columns.
# vapply() always returns a logical vector (one flag per column), whereas
# sapply() returns an empty list for a zero-column data frame, which would
# break the logical subsetting on the next line.
numerical_columns <- vapply(Diabetes, is.numeric, logical(1))
numerical_column_names <- names(numerical_columns)[numerical_columns]
print(numerical_column_names)
## [1] "Pregnancies" "Glucose"
## [3] "BloodPressure" "SkinThickness"
## [5] "Insulin" "BMI"
## [7] "DiabetesPedigreeFunction" "Age"
## [9] "Outcome"
# Extraction of the character columns.
# vapply() guarantees a logical vector (one flag per column) regardless of
# how many columns the data frame has; sapply() does not.
character_columns <- vapply(Diabetes, is.character, logical(1))
character_column_names <- names(character_columns)[character_columns]
print(character_column_names)
## character(0)
# Data cleaning: drop every row that contains at least one missing value,
# then preview the cleaned data.
Diabetes <- stats::na.omit(Diabetes)
print(utils::head(Diabetes))
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# Scatter plot of Pregnancies against Blood Pressure, coloured by Outcome.
# factor() sorts the levels, so Outcome 0 gets code 1 and Outcome 1 gets
# code 2; those codes index the base palette.
variety_color <- as.numeric(factor(Diabetes$Outcome))
# Legend labels in level order (0, then 1); the legend colours are derived
# from this vector so labels and colours cannot drift apart.
outcome_labels <- c("No Diabetes", "Diabetes")
plot(
  x = Diabetes$Pregnancies,
  y = Diabetes$BloodPressure,
  col = variety_color,
  pch = 19,
  cex = 1.3,
  xlab = "Pregnancies",
  ylab = "Blood Pressure",
  main = "Scatter Plot of Pregnancies vs Blood Pressure"
)
# A legend with more descriptive labels
legend(
  "topright",
  legend = outcome_labels,
  col = seq_along(outcome_labels),
  pch = 19,
  title = "Outcome"
)
##### This scatter plot visualizes the relationship between Pregnancies
and Blood Pressure. ##### Each point is colored based on the “Outcome” (0 or 1),
representing the two classes. The legend indicates which color
corresponds to each class.
# Histogram of insulin levels, with gray axis annotations and a reference
# grid added on top.
hist(
  Diabetes$Insulin,
  breaks = 20, # Ask for roughly 20 bins for better resolution
  main = "Histogram plot of Insulin Levels",
  col.main = "#008080",
  cex.main = 1.5, # Adjust the size of the title
  cex.lab = 1.3,
  col.lab = "Maroon",
  font.main = 1, # Use plain (non-bold) font for the title
  xlab = "Insulin",
  # The transparency argument of adjustcolor() is `alpha.f`; spelling it out
  # avoids relying on partial argument matching.
  col = adjustcolor("purple", alpha.f = 0.7),
  border = "white", # White borders for better separation
  xlim = range(Diabetes$Insulin, na.rm = TRUE)
)
# Add gray tick labels on both axes (drawn in addition to hist()'s own axes)
axis(1, at = pretty(Diabetes$Insulin, n = 10), labels = TRUE, col.axis = "darkgray")
axis(2, labels = TRUE, col.axis = "darkgray")
# Addition of a grid for better readability
grid()
# Box-and-whisker plot of the SkinThickness column.
boxplot(
  x = Diabetes$SkinThickness,
  main = "Boxplot of Skin Thickness",
  ylab = "Skin Thickness",
  border = "black",
  col = "#48b5c4",
  pch = 19
)
##### This boxplot visualizes the distribution of “SkinThickness”
values. ##### The box represents the interquartile range (IQR), the line
inside the box is the median, and the whiskers extend to the most extreme
values that lie within 1.5 × IQR of the box, providing insights into the
central tendency and spread of the data. ##### Points beyond the whiskers
are drawn individually as outliers; there’s one outlier in the boxplot
close to a SkinThickness value of 100.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
# ggplot2 scatter plot of Glucose against Pregnancies, one colour per
# Outcome value.
ggplot(
  data = Diabetes,
  mapping = aes(x = Pregnancies, y = Glucose, colour = factor(Outcome))
) +
  geom_point(alpha = 1, size = 2.8) +
  xlab("Pregnancies") +
  ylab("Glucose") +
  ggtitle("Scatter Plot of Pregnancies and Glucose") +
  labs(caption = "Source: Iskulghar", colour = "Outcome") +
  theme_minimal() +
  theme(
    # Base font colour and size for all text
    text = element_text(color = "darkblue", size = 14),
    plot.title = element_text(color = "maroon", hjust = 0.5),
    axis.text.x = element_text(color = "#FF7F7F", size = 14),
    axis.text.y = element_text(color = "#C4A484", size = 14),
    # Legend sits above the panel
    legend.position = "top",
    legend.title = element_text(color = "darkgreen")
  )
##### This scatter plot shows “Glucose” against “Pregnancies” for every
observation, with points colored based on the “Outcome” class (0 or
1). ##### It includes proper axis labels, title, and a caption. Font
colors and sizes are adjusted for readability, and the legend is
positioned at the top.
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.2
# Melting data into long format: one (variable, value) row per cell
melted_data <- melt(Diabetes)
## No id variables; using all as measure variables
# Boxplot of all columns, one panel per column
ggplot(melted_data, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot() +
  labs(
    x = "Columns",
    y = "Values",
    title = "Boxplot of All Columns",
    caption = "Source: Iskulghar"
  ) +
  theme_bw() +
  theme(
    text = element_text(color = "darkblue", size = 10.5),
    legend.position = "top",
    legend.title = element_text(color = "#FF7F7F", size = 13), # Legend title color and size
    plot.title = element_text(color = "maroon", hjust = 0.5)
  ) +
  # One wrapped panel per column, each with its own x and y scales. The
  # argument is spelled `scales`; `scale` only worked via partial matching.
  facet_wrap(~variable, scales = "free")
##### This boxplot visualizes the distribution of values for all columns
in the “Diabetes” dataset. ##### The data is melted into long format for
easier plotting. Proper axis labels, title, and a caption are provided.
##### Font colors and sizes are adjusted, and the legend is positioned
at the top for clarity.
library(plotly)
## Warning: package 'plotly' was built under R version 4.3.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Single-variable distribution plots with plotly.
# `Diabetes` is reused as prepared earlier in the script instead of being
# re-read from "diabetes.csv": a fresh read would undo the na.omit()
# cleaning for this section and for all the analysis that follows. plotly is
# attached once by the library(plotly) call that precedes this section, so
# it is not re-attached before every figure.

# Violin plots, one per column
fig <- plot_ly(Diabetes, y = ~Pregnancies, type = "violin")
fig
fig <- plot_ly(Diabetes, y = ~Glucose, type = "violin")
fig
fig <- plot_ly(Diabetes, y = ~BloodPressure, type = "violin")
fig
fig <- plot_ly(Diabetes, y = ~SkinThickness, type = "violin")
fig
fig <- plot_ly(Diabetes, y = ~Insulin, type = "violin")
fig
fig <- plot_ly(Diabetes, y = ~BMI, type = "violin")
fig
fig <- plot_ly(Diabetes, y = ~DiabetesPedigreeFunction, type = "violin")
fig
fig <- plot_ly(Diabetes, y = ~Age, type = "violin")
fig
fig <- plot_ly(Diabetes, y = ~Outcome, type = "violin")
fig

# Box plots, one per column
fig <- plot_ly(Diabetes, y = ~Pregnancies, type = "box")
fig
fig <- plot_ly(Diabetes, y = ~Glucose, type = "box")
fig
fig <- plot_ly(Diabetes, y = ~BloodPressure, type = "box")
fig
fig <- plot_ly(Diabetes, y = ~SkinThickness, type = "box")
fig
fig <- plot_ly(Diabetes, y = ~Insulin, type = "box")
fig
fig <- plot_ly(Diabetes, y = ~BMI, type = "box")
fig
fig <- plot_ly(Diabetes, y = ~DiabetesPedigreeFunction, type = "box")
fig
fig <- plot_ly(Diabetes, y = ~Age, type = "box")
fig
fig <- plot_ly(Diabetes, y = ~Outcome, type = "box")
fig
# Pairwise correlations between the first nine columns of the data frame.
cor_matrix <- cor(x = Diabetes[, 1:9])
print(cor_matrix)
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.00000000 0.12945867 0.14128198 -0.08167177
## Glucose 0.12945867 1.00000000 0.15258959 0.05732789
## BloodPressure 0.14128198 0.15258959 1.00000000 0.20737054
## SkinThickness -0.08167177 0.05732789 0.20737054 1.00000000
## Insulin -0.07353461 0.33135711 0.08893338 0.43678257
## BMI 0.01768309 0.22107107 0.28180529 0.39257320
## DiabetesPedigreeFunction -0.03352267 0.13733730 0.04126495 0.18392757
## Age 0.54434123 0.26351432 0.23952795 -0.11397026
## Outcome 0.22189815 0.46658140 0.06506836 0.07475223
## Insulin BMI DiabetesPedigreeFunction
## Pregnancies -0.07353461 0.01768309 -0.03352267
## Glucose 0.33135711 0.22107107 0.13733730
## BloodPressure 0.08893338 0.28180529 0.04126495
## SkinThickness 0.43678257 0.39257320 0.18392757
## Insulin 1.00000000 0.19785906 0.18507093
## BMI 0.19785906 1.00000000 0.14064695
## DiabetesPedigreeFunction 0.18507093 0.14064695 1.00000000
## Age -0.04216295 0.03624187 0.03356131
## Outcome 0.13054795 0.29269466 0.17384407
## Age Outcome
## Pregnancies 0.54434123 0.22189815
## Glucose 0.26351432 0.46658140
## BloodPressure 0.23952795 0.06506836
## SkinThickness -0.11397026 0.07475223
## Insulin -0.04216295 0.13054795
## BMI 0.03624187 0.29269466
## DiabetesPedigreeFunction 0.03356131 0.17384407
## Age 1.00000000 0.23835598
## Outcome 0.23835598 1.00000000
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.3.2
# Lower-triangle correlogram of cor_matrix with the coefficient printed in
# each cell.
ggcorrplot(
  corr = cor_matrix,
  lab = TRUE,
  colors = c("purple", "white", "red"),
  type = "lower"
)
# Abbreviate the nine column names so they fit in the pair-plot strips.
colnames(Diabetes) <- c("Prg", "Glu", "BP", "Skthic", "Ins", "BMI", "DPF", "Age", "OC")
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Treat the outcome as categorical so ggpairs() can colour by it.
Diabetes$OC <- factor(Diabetes$OC)
# Pair plot of all columns, coloured by outcome class
ggpairs(
  Diabetes,
  aes(colour = OC),
  lower = list(
    combo = wrap("facethist", bins = 30),
    continuous = wrap("points", size = 2.5)
  ),
  upper = list(
    # The density panels take no `bins` argument; passing one only produced
    # "Ignoring unknown parameters: `bins`" warnings.
    combo = "facetdensity",
    continuous = wrap("cor", size = 2.5)
  )
) +
  theme(
    axis.text.x = element_blank(), # Remove x-axis labels
    axis.text.y = element_blank(), # Remove y-axis labels
    axis.title = element_blank() # Remove axis titles
  )
library(stats)
# Principal component analysis on every column except the ninth (the outcome).
# Variables are centred and scaled before the decomposition. The scaling
# argument of prcomp() is named `scale.` (with a trailing dot); `scale`
# only reached it through partial matching.
Diabetes_pca <- prcomp(Diabetes[, -9], center = TRUE, scale. = TRUE)
Diabetes_pca
## Standard deviations (1, .., p=8):
## [1] 1.4471973 1.3157546 1.0147068 0.9356971 0.8731234 0.8262133 0.6479322
## [8] 0.6359733
##
## Rotation (n x k) = (8 x 8):
## PC1 PC2 PC3 PC4 PC5 PC6
## Prg -0.1284321 0.5937858 -0.01308692 0.08069115 -0.4756057 0.193598168
## Glu -0.3930826 0.1740291 0.46792282 -0.40432871 0.4663280 0.094161756
## BP -0.3600026 0.1838921 -0.53549442 0.05598649 0.3279531 -0.634115895
## Skthic -0.4398243 -0.3319653 -0.23767380 0.03797608 -0.4878621 0.009589438
## Ins -0.4350262 -0.2507811 0.33670893 -0.34994376 -0.3469348 -0.270650609
## BMI -0.4519413 -0.1009598 -0.36186463 0.05364595 0.2532038 0.685372179
## DPF -0.2706114 -0.1220690 0.43318905 0.83368010 0.1198105 -0.085784088
## Age -0.1980271 0.6205885 0.07524755 0.07120060 -0.1092900 -0.033357170
## PC7 PC8
## Prg -0.58879003 -0.117840984
## Glu -0.06015291 -0.450355256
## BP -0.19211793 0.011295538
## Skthic 0.28221253 -0.566283799
## Ins -0.13200992 0.548621381
## BMI -0.03536644 0.341517637
## DPF -0.08609107 0.008258731
## Age 0.71208542 0.211661979
# Importance (standard deviation and variance share) of each component.
print(summary(Diabetes_pca))
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.4472 1.3158 1.0147 0.9357 0.87312 0.82621 0.64793
## Proportion of Variance 0.2618 0.2164 0.1287 0.1094 0.09529 0.08533 0.05248
## Cumulative Proportion 0.2618 0.4782 0.6069 0.7163 0.81164 0.89697 0.94944
## PC8
## Standard deviation 0.63597
## Proportion of Variance 0.05056
## Cumulative Proportion 1.00000
# Scores of every observation on the first two components.
pc_12 <- data.frame(Diabetes_pca$x[, 1:2])
print(head(pc_12))
# The same scores with the outcome class attached as a third column.
pc_12_outcome <- pc_12
pc_12_outcome$Outcome <- Diabetes$OC
print(pc_12_outcome)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.3.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Scree plot of the PCA with the values labelled on the bars
fviz_eig(Diabetes_pca, addlabels = TRUE)
#### f.ii) Contribution plot of PCs
library(factoextra)
# PCA variable factor map, variables coloured by their contribution
fviz_pca_var(
  Diabetes_pca,
  repel = TRUE,
  col.var = "contrib",
  ggtheme = theme_minimal()
)
library(factoextra)
# Heatmap of each variable's contribution to each component
contrib_matrix <- get_pca_var(Diabetes_pca)$contrib
heatmap(
  contrib_matrix,
  Rowv = NA, # Do not cluster rows
  Colv = NA, # Do not cluster columns
  scale = "column", # Scale by column
  col = viridis::viridis(10), # Use "viridis" color palette
  margins = c(5, 10), # Adjust margins
  main = "Contribution Plot - Heatmap",
  xlab = "Principal Components",
  ylab = "Variables"
)
# Individuals on the principal-component plane, coloured by outcome class,
# with ellipses added per class
fviz_pca_ind(
  Diabetes_pca,
  geom.ind = "point",
  col.ind = Diabetes$OC,
  addEllipses = TRUE
)
library(lattice)
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.2
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
# Train/test split, linear-kernel SVM fit and evaluation on the held-out rows.

# Fix the RNG seed so the random partition (and therefore the confusion
# matrix) is reproducible from run to run.
set.seed(123)
# Make sure the response is a factor BEFORE splitting, so that both
# train_data and test_data carry the factor column and svm() is fitted as a
# classifier.
Diabetes$OC <- as.factor(Diabetes$OC)
# 80% of the rows go to training, the rest to testing
train_ix <- createDataPartition(Diabetes$OC, p = 0.8, list = FALSE)
train_data <- Diabetes[train_ix, ]
test_data <- Diabetes[-train_ix, ]
train_data
test_data
svm_model <- svm(
  OC ~ Prg + Glu + BP + Skthic + Ins + BMI + DPF + Age,
  data = train_data,
  kernel = "linear"
)
predictions <- predict(svm_model, newdata = test_data)
# NOTE(review): confusionMatrix() uses the first factor level ("0") as the
# positive class unless `positive` is given; pass positive = "1" if the
# diabetes class should be the one sensitivity is reported for.
confusion_mat <- confusionMatrix(predictions, test_data$OC)
confusion_mat
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 92 24
## 1 8 29
##
## Accuracy : 0.7908
## 95% CI : (0.7178, 0.8523)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 0.0001499
##
## Kappa : 0.5028
##
## Mcnemar's Test P-Value : 0.0080099
##
## Sensitivity : 0.9200
## Specificity : 0.5472
## Pos Pred Value : 0.7931
## Neg Pred Value : 0.7838
## Prevalence : 0.6536
## Detection Rate : 0.6013
## Detection Prevalence : 0.7582
## Balanced Accuracy : 0.7336
##
## 'Positive' Class : 0
##
# Tile plot of the confusion matrix: one tile per (Prediction, Reference)
# cell, filled and labelled by its count.
com <- as.data.frame(confusion_mat$table)
ggplot(data = com, mapping = aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile() +
  geom_text(mapping = aes(label = Freq)) +
  scale_fill_gradient(high = "maroon", low = "lightblue")
# Load and prepare the US admission data set.
US_Admission <- read.csv("US Admission.csv")
# Remove the "Serial No" column (the first column)
US_Admission <- US_Admission[, -1]
# Shorten column names (assigned once; the names are positional, so the
# file is expected to have exactly these eight columns left in this order)
colnames(US_Admission) <- c("GRE", "TOEFL", "Rating", "SOP", "LOR", "CGPA", "Res", "Chance")
library(ggplot2)
library(GGally)
# Convert the "Res" variable to a factor
US_Admission$Res <- factor(US_Admission$Res)
# Remove rows with missing values; na.omit() returns the data unchanged when
# there are none, so no explicit is.na() guard is needed
US_Admission <- na.omit(US_Admission)
# Create a pair plot of all admission variables, coloured by Res
ggpairs(
  US_Admission,
  aes(colour = Res),
  lower = list(
    combo = wrap("facethist", bins = 30),
    continuous = wrap("points", size = 2.5)
  ),
  upper = list(
    # The density panels take no `bins` argument; passing one only produced
    # "Ignoring unknown parameters: `bins`" warnings.
    combo = "facetdensity",
    continuous = wrap("cor", size = 2.5)
  )
) +
  theme(
    axis.text.x = element_blank(), # Remove x-axis labels
    axis.text.y = element_blank(), # Remove y-axis labels
    axis.title = element_blank() # Remove axis titles
  )
library(ggplot2)
# Res was converted to a factor for the pair plot. Convert it back through
# as.character() so the original 0/1 coding is recovered: as.numeric() applied
# directly to a factor returns the internal level codes (1/2) instead.
US_Admission$Res <- as.numeric(as.character(US_Admission$Res))
# Loop through each feature (excluding the target variable "Chance of Admit")
# and fit a simple linear regression with "Chance" as the response, so that
# the model matches the printed interpretation (feature as predictor).
for (feature in setdiff(names(US_Admission), "Chance")) {
  # Create linear regression plot: feature on x, target on y.
  # .data[[feature]] replaces the deprecated aes_string().
  linear_plot <- ggplot(US_Admission, aes(x = .data[[feature]], y = Chance)) +
    geom_point() +
    # Add linear regression line
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
    labs(
      title = paste("Linear Regression -", feature),
      x = feature,
      y = "Chance of Admit"
    ) +
    theme_minimal()
  # Print the plot
  print(linear_plot)
  # Summarize linear regression model Chance ~ <feature>
  lm_formula <- reformulate(feature, response = "Chance")
  lm_model <- lm(lm_formula, data = US_Admission)
  lm_summary <- summary(lm_model)
  # Print regression summary
  cat("\nLinear Regression Summary for", feature, ":\n")
  print(lm_summary)
  # Interpretation, based on the slope row of the coefficient table
  slope <- coef(lm_summary)[feature, ]
  cat("\nInterpretation for", feature, ":\n")
  if (slope[["Pr(>|t|)"]] < 0.05) {
    cat("The variable is a significant predictor of 'Chance of Admit'.\n")
    if (slope[["Estimate"]] > 0) {
      cat("As", feature, "increases, 'Chance of Admit' tends to increase.\n")
    } else {
      cat("As", feature, "increases, 'Chance of Admit' tends to decrease.\n")
    }
  } else {
    cat("The variable is not a significant predictor of 'Chance of Admit'.\n")
  }
  cat("\n--------------------------------------\n")
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
##
## Linear Regression Summary for GRE :
##
## Call:
## lm(formula = paste(feature, "~ Chance"), data = US_Admission)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.5894 -4.7205 0.8904 4.4297 23.9084
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 270.033 1.776 152.06 <2e-16 ***
## Chance 64.574 2.406 26.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.853 on 398 degrees of freedom
## Multiple R-squared: 0.6442, Adjusted R-squared: 0.6433
## F-statistic: 720.6 on 1 and 398 DF, p-value: < 2.2e-16
##
##
## Interpretation for GRE :
## The variable is a significant predictor of 'Chance of Admit'.
## As GRE increases, 'Chance of Admit' tends to increase.
##
## --------------------------------------
## `geom_smooth()` using formula = 'y ~ x'
##
## Linear Regression Summary for TOEFL :
##
## Call:
## lm(formula = paste(feature, "~ Chance"), data = US_Admission)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.6111 -2.5896 0.0145 2.4452 11.4961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.0062 0.9623 86.25 <2e-16 ***
## Chance 33.6906 1.3036 25.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.713 on 398 degrees of freedom
## Multiple R-squared: 0.6266, Adjusted R-squared: 0.6257
## F-statistic: 667.9 on 1 and 398 DF, p-value: < 2.2e-16
##
##
## Interpretation for TOEFL :
## The variable is a significant predictor of 'Chance of Admit'.
## As TOEFL increases, 'Chance of Admit' tends to increase.
##
## --------------------------------------
## `geom_smooth()` using formula = 'y ~ x'
##
## Linear Regression Summary for Rating :
##
## Call:
## lm(formula = paste(feature, "~ Chance"), data = US_Admission)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.40494 -0.57607 -0.06269 0.56478 2.64858
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.0444 0.2086 -5.006 8.35e-07 ***
## Chance 5.7042 0.2826 20.186 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.805 on 398 degrees of freedom
## Multiple R-squared: 0.5059, Adjusted R-squared: 0.5046
## F-statistic: 407.5 on 1 and 398 DF, p-value: < 2.2e-16
##
##
## Interpretation for Rating :
## The variable is a significant predictor of 'Chance of Admit'.
## As Rating increases, 'Chance of Admit' tends to increase.
##
## --------------------------------------
## `geom_smooth()` using formula = 'y ~ x'
##
## Linear Regression Summary for SOP :
##
## Call:
## lm(formula = paste(feature, "~ Chance"), data = US_Admission)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8315 -0.4976 0.0024 0.5029 3.2429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.05579 0.19259 -0.29 0.772
## Chance 4.77089 0.26088 18.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7431 on 398 degrees of freedom
## Multiple R-squared: 0.4566, Adjusted R-squared: 0.4552
## F-statistic: 334.4 on 1 and 398 DF, p-value: < 2.2e-16
##
##
## Interpretation for SOP :
## The variable is a significant predictor of 'Chance of Admit'.
## As SOP increases, 'Chance of Admit' tends to increase.
##
## --------------------------------------
## `geom_smooth()` using formula = 'y ~ x'
##
## Linear Regression Summary for LOR :
##
## Call:
## lm(formula = paste(feature, "~ Chance"), data = US_Admission)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.31398 -0.44060 0.03334 0.49598 1.86129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3954 0.1731 2.284 0.0229 *
## Chance 4.2205 0.2345 18.000 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6679 on 398 degrees of freedom
## Multiple R-squared: 0.4488, Adjusted R-squared: 0.4474
## F-statistic: 324 on 1 and 398 DF, p-value: < 2.2e-16
##
##
## Interpretation for LOR :
## The variable is a significant predictor of 'Chance of Admit'.
## As LOR increases, 'Chance of Admit' tends to increase.
##
## --------------------------------------
## `geom_smooth()` using formula = 'y ~ x'
##
## Linear Regression Summary for CGPA :
##
## Call:
## lm(formula = paste(feature, "~ Chance"), data = US_Admission)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.89091 -0.16933 0.01528 0.16943 1.00290
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.95386 0.07538 78.98 <2e-16 ***
## Chance 3.65163 0.10212 35.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2909 on 398 degrees of freedom
## Multiple R-squared: 0.7626, Adjusted R-squared: 0.762
## F-statistic: 1279 on 1 and 398 DF, p-value: < 2.2e-16
##
##
## Interpretation for CGPA :
## The variable is a significant predictor of 'Chance of Admit'.
## As CGPA increases, 'Chance of Admit' tends to increase.
##
## --------------------------------------
## `geom_smooth()` using formula = 'y ~ x'
##
## Linear Regression Summary for Res :
##
## Call:
## lm(formula = paste(feature, "~ Chance"), data = US_Admission)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.86774 -0.38443 0.04087 0.31109 1.15687
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1472 0.1077 1.366 0.173
## Chance 1.9332 0.1459 13.248 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4157 on 398 degrees of freedom
## Multiple R-squared: 0.306, Adjusted R-squared: 0.3043
## F-statistic: 175.5 on 1 and 398 DF, p-value: < 2.2e-16
##
##
## Interpretation for Res :
## The variable is a significant predictor of 'Chance of Admit'.
## As Res increases, 'Chance of Admit' tends to increase.
##
## --------------------------------------
Significant Predictor: If the p-value for the coefficient of the feature is less than 0.05, it indicates that the variable is a significant predictor of “Chance of Admit.”
Positive/Negative Relationship: The sign of the coefficient indicates the direction of the relationship. A positive coefficient suggests that as the feature increases, “Chance of Admit” tends to increase, and vice versa.
library(ggplot2)
# Loop through each feature (excluding the target variable "Chance of Admit")
# and draw a scatter plot with a degree-2 polynomial fit.
for (feature in setdiff(names(US_Admission), "Chance")) {
  # Create polynomial regression plot.
  # .data[[feature]] selects the column named by the string `feature`; it
  # replaces the deprecated aes_string().
  poly_plot <- ggplot(US_Admission, aes(x = Chance, y = .data[[feature]])) +
    geom_point() +
    # Add polynomial regression line
    geom_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "blue") +
    labs(
      title = paste("Polynomial Regression (Power 2) -", feature),
      x = "Chance of Admit",
      y = feature
    ) +
    theme_minimal()
  # Print the plot
  print(poly_plot)
}
Curvature: Polynomial regression of power 2 allows for a curve in the regression line. It can capture non-linear relationships between the feature and “Chance of Admit.”
Significant Deviation from Linearity: If the relationship between the feature and “Chance of Admit” is not well captured by a straight line, the polynomial regression of power 2 can provide a better fit.
Overfitting Warning: Polynomial regression of higher degrees may lead to overfitting. It’s essential to evaluate the model’s performance and consider a balance between complexity and goodness of fit.
Analyze each plot individually to understand the relationship between each feature and “Chance of Admit” when a quadratic (power 2) relationship is considered. Consider both the visual fit of the curve and the practical significance of the results.
# Multiple linear regression of Chance on every other column of US_Admission.
regression_model <- lm(formula = Chance ~ ., data = US_Admission)
# Show the coefficient table and the fit statistics.
print(summary(regression_model))
##
## Call:
## lm(formula = Chance ~ ., data = US_Admission)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.26259 -0.02103 0.01005 0.03628 0.15928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.2839576 0.1217507 -10.546 < 2e-16 ***
## GRE 0.0017374 0.0005979 2.906 0.00387 **
## TOEFL 0.0029196 0.0010895 2.680 0.00768 **
## Rating 0.0057167 0.0047704 1.198 0.23150
## SOP -0.0033052 0.0055616 -0.594 0.55267
## LOR 0.0223531 0.0055415 4.034 6.6e-05 ***
## CGPA 0.1189395 0.0122194 9.734 < 2e-16 ***
## Res 0.0245251 0.0079598 3.081 0.00221 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06378 on 392 degrees of freedom
## Multiple R-squared: 0.8035, Adjusted R-squared: 0.8
## F-statistic: 228.9 on 7 and 392 DF, p-value: < 2.2e-16
# Print the coefficients and their significance
cat("\nCoefficients:\n")
##
## Coefficients:
print(coef(regression_model))
## (Intercept) GRE TOEFL Rating SOP LOR
## -1.283957585 0.001737412 0.002919577 0.005716658 -0.003305169 0.022353127
## CGPA Res
## 0.118939454 0.024525106
cat("\nSignificant Predictors:\n")
##
## Significant Predictors:
# names() of a logical vector returns every name, TRUE or FALSE alike, so the
# comparison has to be used as a subsetting mask instead. The intercept is
# not a predictor and is excluded; which() drops any NA p-values.
p_values <- summary(regression_model)$coefficients[, "Pr(>|t|)"]
p_values <- p_values[names(p_values) != "(Intercept)"]
significant_predictors <- names(p_values)[which(p_values < 0.05)]
print(significant_predictors)
## [1] "GRE" "TOEFL" "LOR" "CGPA" "Res"
Coefficients: The coefficients in the summary represent the estimated effect of each predictor variable on the target variable (“Chance of Admit”). A positive coefficient indicates a positive relationship, while a negative coefficient indicates a negative relationship.
P-values: The p-values associated with the coefficients test the null hypothesis that each coefficient is equal to zero. A low p-value (typically less than 0.05) suggests that the corresponding predictor variable is a significant predictor of the target variable.
Adjusted R-squared: The adjusted R-squared value takes into account the number of predictors in the model and provides an indication of the goodness of fit. A higher adjusted R-squared suggests a better fit.
Residuals: The residuals represent the differences between the observed and predicted values. Check for patterns in the residuals to assess the model’s assumptions.