Load and inspect the data
# Read the Excel workbook into `df` and print a first summary of its columns.
library(readxl)
# NOTE(review): absolute, machine-specific path; the script only runs where
# this exact file location exists.
workbook_path <- "C:\\Users\\Dell\\OneDrive\\Desktop\\STATISTICS\\2nd Year\\R programming\\varsity\\Project_Distribution.xlsx"
df <- read_excel(path = workbook_path)
# View(df)
summary(df)
ExamRoll Sex CGPA Group
Min. :2316601 Length:80 Min. :2.053 Min. :1.000
1st Qu.:2316622 Class :character 1st Qu.:3.242 1st Qu.:1.000
Median :2316643 Mode :character Median :3.484 Median :2.000
Mean :2316642 Mean :3.415 Mean :2.275
3rd Qu.:2316662 3rd Qu.:3.769 3rd Qu.:3.000
Max. :2316682 Max. :3.988 Max. :4.000
AssignedTeacher
Length:80
Class :character
Mode :character
# Total number of missing cells across the whole data frame.
missing_cells <- is.na(df)
sum(missing_cells)
[1] 0
Convert the column classes
# Store exam rolls as character strings instead of numbers.
df[["ExamRoll"]] <- as.character(df[["ExamRoll"]])
# Turn the three categorical columns into factors in one assignment.
factor_cols <- c("Sex", "Group", "AssignedTeacher")
df[factor_cols] <- lapply(df[factor_cols], as.factor)
# Confirm the conversion took effect.
class(df[["AssignedTeacher"]])
[1] "factor"
Inspect the data again
# Re-print the summary so the factor columns show per-level counts.
summary(df)
ExamRoll Sex CGPA Group AssignedTeacher
Length:80 F:30 Min. :2.053 1:23 A : 4
Class :character M:50 1st Qu.:3.242 2:23 B : 4
Mode :character Median :3.484 3:23 C : 4
Mean :3.415 4:11 D : 4
3rd Qu.:3.769 E : 4
Max. :3.988 FT : 4
(Other):56
Descriptive Statistics For CGPA (i) Mean (ii) Median (iii) Standard deviation
library(psych)
# Run describe() once, display the full table, then pull out the mean.
cgpa_stats <- describe(df$CGPA)
cgpa_stats
cgpa_stats$mean
[1] 3.415225
Frequency Distributions (i) Sex (ii) Group (iii) AssignedTeacher
# Frequency of each Sex level.
table(df[["Sex"]])
F M
30 50
# Frequency of each Group level.
table(df[["Group"]])
1 2 3 4
23 23 23 11
# Number of students assigned to each teacher.
table(df[["AssignedTeacher"]])
A B C D E FT G H I J K L MT N O P Q R S T U V W
4 4 4 4 4 4 4 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3
# Base-graphics bar chart of the level counts for each categorical column,
# drawn in the order Sex, Group, AssignedTeacher.
for (column_name in c("Sex", "Group", "AssignedTeacher")) {
  barplot(table(df[[column_name]]))
}
Group Comparisons (i) CGPA by Sex (mean, SD, count) (ii) CGPA by Group (mean, SD, count)
library(dplyr)

# Count, mean CGPA and SD of CGPA (both rounded to 2 decimals) for every level
# of the grouping column passed unquoted as `group_col`.
summarise_cgpa_by <- function(data, group_col) {
  data %>%
    group_by({{ group_col }}) %>%
    summarise(
      Count = n(),
      Mean_CGPA = round(mean(CGPA, na.rm = TRUE), 2),
      SD_CGPA = round(sd(CGPA, na.rm = TRUE), 2),
      .groups = "drop"
    )
}

# CGPA by Sex
result_sx <- summarise_cgpa_by(df, Sex)
result_sx

# CGPA by Group
result_grp <- summarise_cgpa_by(df, Group)
result_grp
NA
Distribution Plots (i) Histogram of CGPA (ii) Density plot of CGPA
library(ggplot2)
# Both distribution plots share the same data and x aesthetic.
cgpa_canvas <- ggplot(data = df, mapping = aes(x = CGPA))

# Histogram of CGPA
cgpa_canvas +
  geom_histogram(bins = 30, fill = 'Steelblue', color = 'white') +
  labs(title = "Histogram of CGPA", x = "CGPA", y = "Count") +
  theme_minimal()

# Density plot of CGPA
cgpa_canvas +
  geom_density(fill = 'Steelblue') +
  labs(title = "Density plot of CGPA", x = "CGPA", y = "Density") +
  theme_minimal()
Categorical Data Plots: (i) Bar plot of Sex (ii) Bar plot of Group (iii) Bar plot of AssignedTeacher (iv) Boxplots of CGPA by Sex and by Group
# Each chart maps the same factor to both the x axis and the fill colour, so
# every bar gets its own colour and a matching legend entry.

# Barplot of Sex
sex_bars <- ggplot(data = df, mapping = aes(x = Sex, fill = Sex)) +
  geom_bar()
sex_bars + labs(title = 'Bar plot of Sex', x = "Sex", y = "Count")

# Barplot of Group
group_bars <- ggplot(data = df, mapping = aes(x = Group, fill = Group)) +
  geom_bar()
group_bars + labs(title = 'Bar plot of Group', x = "Group", y = "Count")

# Barplot of AssignedTeacher
teacher_bars <- ggplot(
  data = df,
  mapping = aes(x = AssignedTeacher, fill = AssignedTeacher)
) +
  geom_bar()
teacher_bars +
  labs(title = 'Bar plot of AssignedTeacher', x = "Teacher", y = "Count")
# Yellow-filled point (shape 23) at each box's mean CGPA; the same layer is
# added to both boxplots below.
mean_marker <- stat_summary(
  fun = mean,
  geom = "point",
  shape = 23,
  size = 3,
  fill = "yellow"
)

# Boxplot of CGPA by Sex
ggplot(data = df, mapping = aes(x = Sex, y = CGPA, fill = Sex)) +
  geom_boxplot(outliers = TRUE, outlier.color = "Red") +
  mean_marker

# Boxplot of CGPA by Group
ggplot(data = df, mapping = aes(x = Group, y = CGPA, fill = Group)) +
  geom_boxplot(outliers = TRUE, outlier.color = "Red") +
  mean_marker
Mean Comparisons
# t-test: CGPA by Sex
# t.test() is called with its defaults; the result is stored and then printed.
cgpa_sex_ttest <- t.test(df$CGPA ~ df$Sex)
cgpa_sex_ttest
Welch Two Sample t-test
data: df$CGPA by df$Sex
t = 1.7854, df = 70.874, p-value = 0.07848
alternative hypothesis: true difference in means between group F and group M is not equal to 0
95 percent confidence interval:
-0.01995516 0.36152849
sample estimates:
mean in group F mean in group M
3.521967 3.351180
# ANOVA: CGPA by Group
# Fit the one-factor linear model first, then print its ANOVA table.
group_fit <- lm(CGPA ~ Group, data = df)
anova(group_fit)
Analysis of Variance Table
Response: CGPA
Df Sum Sq Mean Sq F value Pr(>F)
Group 3 13.6697 4.5566 203.13 < 2.2e-16 ***
Residuals 76 1.7048 0.0224
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# ANOVA: CGPA by AssignedTeacher
# Fit the one-factor linear model first, then print its ANOVA table.
teacher_fit <- lm(CGPA ~ AssignedTeacher, data = df)
anova(teacher_fit)
Analysis of Variance Table
Response: CGPA
Df Sum Sq Mean Sq F value Pr(>F)
AssignedTeacher 22 1.780 0.08091 0.3392 0.9966
Residuals 57 13.595 0.23850
Association Tests
# Chi-square test: Sex vs Group
# The cross-tabulation is passed inline so the printed `data:` line still
# shows the table expression.
sex_group_test <- chisq.test(table(df$Sex, df$Group))
sex_group_test
Pearson's Chi-squared test
data: table(df$Sex, df$Group)
X-squared = 4.0095, df = 3, p-value = 0.2604
# Chi-square test: Sex vs AssignedTeacher
# The 2 x 23 table spreads 80 students over 46 cells (3-4 students per
# teacher), so every expected count is well below 5 and the asymptotic
# chi-square p-value is unreliable. Use a Monte Carlo p-value instead.
# The seed keeps the reported p-value reproducible between runs.
set.seed(2024)
chisq.test(
  table(df$Sex, df$AssignedTeacher),
  simulate.p.value = TRUE,
  B = 10000
)
Pearson's Chi-squared test
data: table(df$Sex, df$AssignedTeacher)
X-squared = 25.244, df = 22, p-value = 0.2855
Regression Analysis. Simple linear regression: Model 1: CGPA ~ Sex; Model 2: CGPA ~ Group. Multiple regression: Full model: CGPA ~ Sex + Group + AssignedTeacher.
# Fit the three regression models on `df`.
# Variables are named in the formula and supplied through `data = df` (the
# same form the ANOVA calls in this file use) rather than written as `df$...`.
# With `df$` inside a formula, predict(newdata = ...) ignores the new data
# and every coefficient label carries a `df$` prefix.
model1 <- lm(CGPA ~ Sex, data = df)
model2 <- lm(CGPA ~ Group, data = df)
full_model <- lm(CGPA ~ Sex + Group + AssignedTeacher, data = df)

# Visual assumption checks for each fitted model.
library(performance)
check_model(model1)
check_model(model2)
check_model(full_model)

# Coefficient table and fit statistics for the Sex-only model.
summary(model1)
Call:
lm(formula = df$CGPA ~ df$Sex)
Residuals:
Min 1Q Median 3Q Max
-1.29818 -0.22513 0.09743 0.28478 0.60182
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.5220 0.0796 44.244 <2e-16 ***
df$SexM -0.1708 0.1007 -1.696 0.0938 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.436 on 78 degrees of freedom
Multiple R-squared: 0.03557, Adjusted R-squared: 0.02321
F-statistic: 2.877 on 1 and 78 DF, p-value: 0.09384
# Coefficient table and fit statistics for the Group-only model.
summary(model2)
Call:
lm(formula = df$CGPA ~ df$Group)
Residuals:
Min 1Q Median 3Q Max
-0.48527 -0.07246 0.00333 0.10026 0.44173
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.83774 0.03123 122.887 < 2e-16 ***
df$Group2 -0.26913 0.04417 -6.094 4.23e-08 ***
df$Group3 -0.57900 0.04417 -13.110 < 2e-16 ***
df$Group4 -1.29947 0.05491 -23.668 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1498 on 76 degrees of freedom
Multiple R-squared: 0.8891, Adjusted R-squared: 0.8847
F-statistic: 203.1 on 3 and 76 DF, p-value: < 2.2e-16
# Coefficient table and fit statistics for the model with all three predictors.
summary(full_model)
Call:
lm(formula = df$CGPA ~ df$Sex + df$Group + df$AssignedTeacher)
Residuals:
Min 1Q Median 3Q Max
-0.37629 -0.07189 0.01049 0.06907 0.29031
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.697292 0.079024 46.787 < 2e-16 ***
df$SexM 0.053618 0.042178 1.271 0.2092
df$Group2 -0.278455 0.043500 -6.401 4.14e-08 ***
df$Group3 -0.588325 0.043500 -13.525 < 2e-16 ***
df$Group4 -1.321622 0.058344 -22.652 < 2e-16 ***
df$AssignedTeacherB 0.225250 0.102817 2.191 0.0329 *
df$AssignedTeacherC 0.183846 0.103356 1.779 0.0810 .
df$AssignedTeacherD 0.066154 0.103356 0.640 0.5249
df$AssignedTeacherE 0.093191 0.104957 0.888 0.3786
df$AssignedTeacherFT 0.033096 0.103356 0.320 0.7501
df$AssignedTeacherG 0.041346 0.103356 0.400 0.6907
df$AssignedTeacherH 0.130846 0.103356 1.266 0.2111
df$AssignedTeacherI 0.007691 0.104957 0.073 0.9419
df$AssignedTeacherJ 0.260404 0.103356 2.519 0.0148 *
df$AssignedTeacherK 0.264250 0.102817 2.570 0.0130 *
df$AssignedTeacherL 0.005017 0.114247 0.044 0.9651
df$AssignedTeacherMT 0.202096 0.111863 1.807 0.0765 .
df$AssignedTeacherN -0.007777 0.112184 -0.069 0.9450
df$AssignedTeacherO 0.175557 0.112184 1.565 0.1236
df$AssignedTeacherP 0.143302 0.113299 1.265 0.2115
df$AssignedTeacherQ 0.021557 0.112184 0.192 0.8484
df$AssignedTeacherR 0.193096 0.111863 1.726 0.0901 .
df$AssignedTeacherS 0.087351 0.114247 0.765 0.4479
df$AssignedTeacherT 0.113557 0.112184 1.012 0.3160
df$AssignedTeacherU 0.023890 0.112184 0.213 0.8322
df$AssignedTeacherV 0.165351 0.114247 1.447 0.1537
df$AssignedTeacherW 0.211429 0.111863 1.890 0.0642 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1454 on 53 degrees of freedom
Multiple R-squared: 0.9271, Adjusted R-squared: 0.8914
F-statistic: 25.93 on 26 and 53 DF, p-value: < 2.2e-16
Outlier Detection
# Detect outliers using IQR rule
# Quartiles are computed with na.rm = TRUE (matching the group means and SDs
# computed earlier) so quantile() does not stop if a CGPA is missing.
cgpa_quartiles <- quantile(df$CGPA, probs = c(0.25, 0.75), na.rm = TRUE)
Q1 <- cgpa_quartiles[1]
Q3 <- cgpa_quartiles[2]
IQR_val <- Q3 - Q1
# Fences sit 1.5 IQRs beyond the quartiles.
lower <- Q1 - 1.5 * IQR_val
upper <- Q3 + 1.5 * IQR_val
# which() keeps only rows where the comparison is TRUE, so a missing CGPA
# never comes back as an all-NA row.
outliers <- df[which(df$CGPA < lower | df$CGPA > upper), ]
outliers
boxplot(df$CGPA, col = "steelblue", main = "CGPA boxplot")
Regression Diagnostics
# Standard lm diagnostic plots for the full model.
plot(x = full_model)
# Shapiro-Wilk test on the full model's residuals. The residuals() call is
# passed inline so the printed `data:` line is unchanged.
residual_normality <- shapiro.test(residuals(full_model))
residual_normality
Shapiro-Wilk normality test
data: residuals(full_model)
W = 0.97797, p-value = 0.1824
library(car)
# Variance inflation factors for the full model's predictors.
collinearity <- vif(full_model)
collinearity
GVIF Df GVIF^(1/(2*Df))
df$Sex 1.577656 1 1.256048
df$Group 1.241791 3 1.036752
df$AssignedTeacher 1.723375 22 1.012447