mydata <- read.csv("student_exam_scores.csv", header = TRUE, sep = ",", dec = ".")
head(mydata)
## student_id hours_studied sleep_hours attendance_percent previous_scores
## 1 S001 8.0 8.8 72.1 45
## 2 S002 1.3 8.6 60.7 55
## 3 S003 4.0 8.2 73.7 86
## 4 S004 3.5 4.8 95.1 66
## 5 S005 9.1 6.4 89.8 71
## 6 S006 8.4 5.1 58.5 75
## exam_score
## 1 30.2
## 2 25.0
## 3 35.8
## 4 34.0
## 5 40.3
## 6 35.7
# Create a new variable: study/sleep ratio
mydata$study_sleep_ratio <- mydata$hours_studied / mydata$sleep_hours
# Create a categorical variable for attendance
mydata$attendance_cat <- ifelse(mydata$attendance_percent >= 75, "High", "Low")
# Rename variable
colnames(mydata)[colnames(mydata) == "previous_scores"] <- "past_scores"
# Create a subset with students who studied more than 5 hours
mydata2 <- subset(mydata, hours_studied > 5)
# Remove students with attendance < 60%
mydata2 <- mydata2[mydata2$attendance_percent >= 60, ]
head(mydata2)
## student_id hours_studied sleep_hours attendance_percent past_scores
## 1 S001 8.0 8.8 72.1 45
## 5 S005 9.1 6.4 89.8 71
## 9 S009 5.6 5.9 81.6 84
## 12 S012 6.6 7.9 87.6 85
## 15 S015 8.1 8.8 60.0 90
## 18 S018 7.5 7.6 73.8 58
## exam_score study_sleep_ratio attendance_cat
## 1 30.2 0.9090909 Low
## 5 40.3 1.4218750 High
## 9 34.7 0.9491525 High
## 12 35.1 0.8354430 High
## 15 41.1 0.9204545 Low
## 18 36.3 0.9868421 Low
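The two filtering steps above can also be combined into a single call; a minimal equivalent sketch (the object name mydata2_alt is only illustrative):
# One-step filter: studied more than 5 hours AND attendance of at least 60%
mydata2_alt <- subset(mydata, hours_studied > 5 & attendance_percent >= 60)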
# Summary statistics
summary(mydata[, c("hours_studied", "sleep_hours", "exam_score")])
## hours_studied sleep_hours exam_score
## Min. : 1.000 Min. :4.000 Min. :17.10
## 1st Qu.: 3.500 1st Qu.:5.300 1st Qu.:29.50
## Median : 6.150 Median :6.700 Median :34.05
## Mean : 6.325 Mean :6.622 Mean :33.95
## 3rd Qu.: 9.000 3rd Qu.:8.025 3rd Qu.:38.75
## Max. :12.000 Max. :9.000 Max. :51.30
# Mean, median and standard deviation for exam scores
mean(mydata$exam_score)
## [1] 33.955
median(mydata$exam_score)
## [1] 34.05
sd(mydata$exam_score)
## [1] 6.789548
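To see how exam scores differ across the attendance groups created earlier, a short sketch using base R's tapply():
# Mean and standard deviation of exam scores within each attendance category
tapply(mydata$exam_score, mydata$attendance_cat, mean)
tapply(mydata$exam_score, mydata$attendance_cat, sd)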
library(ggplot2)
# Histogram of exam scores
ggplot(mydata, aes(x = exam_score)) +
  geom_histogram(binwidth = 5, fill = "skyblue", color = "black") +
  labs(title = "Distribution of Exam Scores", x = "Exam Score", y = "Frequency")
# Scatterplot: hours studied vs exam score
ggplot(mydata, aes(x = hours_studied, y = exam_score)) +
  geom_point(color = "blue") +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(title = "Hours Studied vs Exam Score", x = "Hours Studied", y = "Exam Score")
## `geom_smooth()` using formula = 'y ~ x'
# Boxplot: exam scores by attendance category
ggplot(mydata, aes(x = attendance_cat, y = exam_score, fill = attendance_cat)) +
  geom_boxplot() +
  labs(title = "Exam Scores by Attendance Category", x = "Attendance Category", y = "Exam Score")
library(readxl)
mydata3 <- read_xlsx("Business School.xlsx")
mydata3 <- as.data.frame(mydata3)
head(mydata3)
## Student ID Undergrad Degree Undergrad Grade MBA Grade Work Experience
## 1 1 Business 68.4 90.2 No
## 2 2 Computer Science 70.2 68.7 Yes
## 3 3 Finance 76.4 83.3 No
## 4 4 Business 82.6 88.7 No
## 5 5 Finance 76.9 75.4 No
## 6 6 Computer Science 83.3 82.1 No
## Employability (Before) Employability (After) Status Annual Salary
## 1 252 276 Placed 111000
## 2 101 119 Placed 107000
## 3 401 462 Placed 109000
## 4 287 342 Placed 148000
## 5 275 347 Placed 255500
## 6 254 313 Placed 103500
library(ggplot2)
ggplot(mydata3, aes(x = `Undergrad Degree`, fill = `Undergrad Degree`)) +
  geom_bar() +
  labs(title = "Distribution of Undergraduate Degrees",
       x = "Undergraduate Degree",
       y = "Count") +
  theme_minimal()
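The counts behind this bar chart can also be obtained directly with table():
# Number of students per undergraduate degree
table(mydata3$`Undergrad Degree`)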
summary(mydata3$`Annual Salary`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20000 87125 103500 109058 124000 340000
mean(mydata3$`Annual Salary`)
## [1] 109058
median(mydata3$`Annual Salary`)
## [1] 103500
sd(mydata3$`Annual Salary`)
## [1] 41501.49
ggplot(mydata3, aes(x = `Annual Salary`)) +
  geom_histogram(binwidth = 20000, fill = "lightblue", color = "black") +
  labs(title = "Distribution of Annual Salary",
       x = "Annual Salary",
       y = "Frequency") +
  theme_minimal()
# Extract the MBA grades for a one-sample t-test against a mean of 74
grades <- mydata3$`MBA Grade`
t_test_result <- t.test(grades, mu = 74)
print(t_test_result)
##
## One Sample t-test
##
## data: grades
## t = 2.6587, df = 99, p-value = 0.00915
## alternative hypothesis: true mean is not equal to 74
## 95 percent confidence interval:
## 74.51764 77.56346
## sample estimates:
## mean of x
## 76.04055
cohen_d <- (mean(grades) - 74) / sd(grades)
print(cohen_d)
## [1] 0.2658658
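For a one-sample t-test, Cohen's d equals t / sqrt(n), so the effect size above can be cross-checked from the test statistic:
# Same effect size via the t statistic: d = t / sqrt(n)
as.numeric(t_test_result$statistic) / sqrt(length(grades))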
library(readxl)
mydata4 <- read_xlsx("C:/Users/Korisnik/Desktop/domaci/Apartments.xlsx")
mydata4 <- as.data.frame(mydata4)
head(mydata4)
## Age Distance Price Parking Balcony
## 1 7 28 1640 0 1
## 2 18 1 2800 1 0
## 3 7 28 1660 0 0
## 4 28 29 1850 0 1
## 5 18 18 1640 1 1
## 6 28 12 1770 0 1
Description: the Apartments data set contains 85 apartments. Age and Distance are numeric predictors, Price is the response, and Parking and Balcony are 0/1 indicators, which are converted to factors below.
mydata4$Parking <- as.factor(mydata4$Parking)
mydata4$Balcony <- as.factor(mydata4$Balcony)
str(mydata4)
## 'data.frame': 85 obs. of 5 variables:
## $ Age : num 7 18 7 28 18 28 14 18 22 25 ...
## $ Distance: num 28 1 28 29 18 12 20 6 7 2 ...
## $ Price : num 1640 2800 1660 1850 1640 1770 1850 1970 2270 2570 ...
## $ Parking : Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 2 2 2 ...
## $ Balcony : Factor w/ 2 levels "0","1": 2 1 1 2 2 2 2 2 1 1 ...
t_test_result <- t.test(mydata4$Price, mu = 1900)
t_test_result
##
## One Sample t-test
##
## data: mydata4$Price
## t = 2.9022, df = 84, p-value = 0.004731
## alternative hypothesis: true mean is not equal to 1900
## 95 percent confidence interval:
## 1937.443 2100.440
## sample estimates:
## mean of x
## 2018.941
mean(mydata4$Price)
## [1] 2018.941
fit1 <- lm(Price ~ Age, data = mydata4)
summary(fit1)
##
## Call:
## lm(formula = Price ~ Age, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -623.9 -278.0 -69.8 243.5 776.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2185.455 87.043 25.108 <2e-16 ***
## Age -8.975 4.164 -2.156 0.034 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 369.9 on 83 degrees of freedom
## Multiple R-squared: 0.05302, Adjusted R-squared: 0.04161
## F-statistic: 4.647 on 1 and 83 DF, p-value: 0.03401
# Correlation
cor(mydata4$Price, mydata4$Age)
## [1] -0.230255
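With a single predictor, Multiple R-squared is simply the squared correlation, which gives a quick consistency check of the two results above:
# Squared correlation reproduces the R-squared of fit1 (about 0.053)
cor(mydata4$Price, mydata4$Age)^2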
pairs(~ Price + Age + Distance, data = mydata4, main = "Scatterplot Matrix")
fit2 <- lm(Price ~ Age + Distance, data = mydata4)
summary(fit2)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -603.23 -219.94 -85.68 211.31 689.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2460.101 76.632 32.10 < 2e-16 ***
## Age -7.934 3.225 -2.46 0.016 *
## Distance -20.667 2.748 -7.52 6.18e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 286.3 on 82 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4259
## F-statistic: 32.16 on 2 and 82 DF, p-value: 4.896e-11
library(car)
## Loading required package: carData
vif(fit2)
## Age Distance
## 1.001845 1.001845
# Standardized residuals
std_resid <- rstandard(fit2)
# Cook's distance
cooks_d <- cooks.distance(fit2)
# Identify potential outliers
which(abs(std_resid) > 2)
## 33 38 53
## 33 38 53
which(cooks_d > 4/length(cooks_d))
## 22 33 38 53 55
## 22 33 38 53 55
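These diagnostics can also be inspected graphically with base R's plot method for lm objects; a minimal sketch (plot number 4 shows Cook's distance per observation):
# Cook's distance plot; the flagged observations stand out
plot(fit2, which = 4)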
std_fit <- scale(fitted(fit2))
plot(std_fit, std_resid,
     xlab = "Standardized Fitted Values",
     ylab = "Standardized Residuals",
     main = "Residuals vs Fitted")
abline(h = 0, col = "red")
# Histogram
hist(std_resid, main = "Histogram of Standardized Residuals", xlab = "Std Residuals")
# Q-Q plot
qqnorm(std_resid)
qqline(std_resid, col = "red")
# Shapiro-Wilk test
shapiro.test(std_resid)
##
## Shapiro-Wilk normality test
##
## data: std_resid
## W = 0.95306, p-value = 0.00366
# Remove problematic units (if any found earlier)
clean_data <- mydata4[abs(std_resid) <= 2 & cooks_d <= 4/length(cooks_d), ]
fit2_clean <- lm(Price ~ Age + Distance, data = clean_data)
summary(fit2_clean)
##
## Call:
## lm(formula = Price ~ Age + Distance, data = clean_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -411.50 -203.69 -45.24 191.11 492.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2502.467 75.024 33.356 < 2e-16 ***
## Age -8.674 3.221 -2.693 0.00869 **
## Distance -24.063 2.692 -8.939 1.57e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 256.8 on 77 degrees of freedom
## Multiple R-squared: 0.5361, Adjusted R-squared: 0.524
## F-statistic: 44.49 on 2 and 77 DF, p-value: 1.437e-13
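As a sanity check on the cleaning step, the number of removed apartments can be counted directly (the residual degrees of freedom drop from 82 to 77, i.e. five flagged rows):
# How many observations were dropped before refitting
nrow(mydata4) - nrow(clean_data)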
fit3 <- lm(Price ~ Age + Distance + Parking + Balcony, data = mydata4)
summary(fit3)
##
## Call:
## lm(formula = Price ~ Age + Distance + Parking + Balcony, data = mydata4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -459.92 -200.66 -57.48 260.08 594.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2301.667 94.271 24.415 < 2e-16 ***
## Age -6.799 3.110 -2.186 0.03172 *
## Distance -18.045 2.758 -6.543 5.28e-09 ***
## Parking1 196.168 62.868 3.120 0.00251 **
## Balcony1 1.935 60.014 0.032 0.97436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.7 on 80 degrees of freedom
## Multiple R-squared: 0.5004, Adjusted R-squared: 0.4754
## F-statistic: 20.03 on 4 and 80 DF, p-value: 1.849e-11
R² ≈ 0.50: the extended model explains somewhat more of the variation in prices than fit2 (R² ≈ 0.44); adjusted R² rises from about 0.43 to 0.48.
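The R-squared values quoted here can be pulled straight from the model summaries instead of being read off the printouts:
# Explained variance of the two models
summary(fit2)$r.squared
summary(fit3)$r.squared
summary(fit3)$adj.r.squared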
anova(fit2, fit3)
## Analysis of Variance Table
##
## Model 1: Price ~ Age + Distance
## Model 2: Price ~ Age + Distance + Parking + Balcony
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 82 6720983
## 2 80 5991088 2 729894 4.8732 0.01007 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
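Besides the partial F-test, the two nested models can be compared with an information criterion; a minimal sketch using base R's AIC():
# Lower AIC indicates the preferred model
AIC(fit2, fit3)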
fitted_values <- fitted(fit3)
residuals_fit3 <- residuals(fit3)
fitted_values[2]
## 2
## 2357.411
residuals_fit3[2]
## 2
## 442.5889
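As a final check, the residual for the second apartment is just its observed price minus the fitted value (2800 - 2357.411 ≈ 442.589):
# Residual = observed - fitted for observation 2
mydata4$Price[2] - fitted_values[2]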