##
## Attaching package: 'tufte'
## The following objects are masked from 'package:tint':
##
## margin_note, newthought, quote_footer, sans_serif
Visit the following link for more information about R: RStudio
# Load the survey responses from CSV and preview the first six rows
data <- read.csv(file = "data_data.csv")
head(data, n = 6L)
## Timestamp Institution
## 1 04/11/2024 11:42 UR
## 2 04/11/2024 11:42 UR
## 3 04/11/2024 11:43 UR
## 4 04/11/2024 11:43 UR
## 5 04/11/2024 11:43 UR
## 6 04/11/2024 11:44 AIMS
## Background
## 1 Data Science
## 2 Data Science
## 3 Applied Statistics
## 4 Bachelor in Science of Quantity Surveying and construction management
## 5 Data science
## 6 Data science
## Completed.Program Gender Date.of.Birth
## 1 Master's degree Female 02/04/1986
## 2 Master's degree Male 08/04/1990
## 3 Bachelor's degree Male 17/08/2000
## 4 Bachelor's degree Male 01/01/2000
## 5 Master's degree Female 23/10/1994
## 6 Master's degree Female 01/01/1999
# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session; skip it when the document is rendered non-interactively
if (interactive()) {
  View(data)
}
Quadratic equation: \[ax^2+bx+c=0\] where \(a\), \(b\) and \(c\) are real numbers and \(a \neq 0\)
Equation of normal distribution: \[p(x; \mu, \sigma) = \frac{1}{\sigma \sqrt{2 \pi}} e^{\frac{-(x-\mu)^2}{2 \sigma^2}}\]
# Institution column as a one-column data frame (column named data.Institution)
data.frame(data.Institution = data$Institution)
## data.Institution
## 1 UR
## 2 UR
## 3 UR
## 4 UR
## 5 UR
## 6 AIMS
## 7 UR
## 8 UR
## 9 UR
## 10 UR
## 11 UR
## 12 UR
## 13 UR
## 14 UR
## 15 UR
## 16 UR
## 17 UR
## 18 UR
## 19 UR
## 20 UR
## 21 UR
## 22 UR
## 23 UR
## 24 University of Kerala
## 25 UR
## 26 EAUR
## 27 UR
## 28 UR
## 29 UTB
## 30 EAUR
## 31 UTB
## 32 UR
## 33 UR
## 34 UR
## 35 AIMS Rwanda
## 36 UTB
## 37 INES Ruhengeri
## 38 INES
## 39 INES
## 40 UR
## 41 UR
## 42 UR
## 43 AUCA
## 44 UR
## 45 UR
## 46 UR
## 47 UR
## 48 UR
## 49 UR
## 50 UR
## 51 UR
# First six rows of column 3 (Background), wrapped in a one-column data frame
data.frame(data[c(1:6),3])
## data.c.1.6...3.
## 1 Data Science
## 2 Data Science
## 3 Applied Statistics
## 4 Bachelor in Science of Quantity Surveying and construction management
## 5 Data science
## 6 Data science
# Completed.Program values for male respondents who hold a bachelor's degree
data$Completed.Program[
  data$Gender == "Male" & data$Completed.Program == "Bachelor's degree"
]
## [1] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [4] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [7] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [10] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [13] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [16] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [19] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [22] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [25] "Bachelor's degree" "Bachelor's degree"
# Timestamp and Institution (columns 1 and 2) for male respondents
# who hold a bachelor's degree
data[
  data$Gender == "Male" & data$Completed.Program == "Bachelor's degree",
  1:2
]
## Timestamp Institution
## 3 04/11/2024 11:43 UR
## 4 04/11/2024 11:43 UR
## 7 04/11/2024 11:45 UR
## 8 04/11/2024 11:46 UR
## 11 04/11/2024 11:48 UR
## 12 04/11/2024 11:48 UR
## 13 04/11/2024 11:48 UR
## 14 04/11/2024 11:49 UR
## 17 04/11/2024 11:49 UR
## 20 04/11/2024 11:50 UR
## 22 04/11/2024 11:51 UR
## 23 04/11/2024 11:51 UR
## 25 04/11/2024 11:55 UR
## 28 04/11/2024 11:57 UR
## 30 04/11/2024 11:57 EAUR
## 31 04/11/2024 11:58 UTB
## 33 04/11/2024 12:01 UR
## 36 04/11/2024 12:05 UTB
## 43 06/11/2024 9:26 AUCA
## 44 06/11/2024 9:56 UR
## 45 06/11/2024 10:08 UR
## 46 06/11/2024 10:26 UR
## 47 07/11/2024 9:04 UR
## 48 07/11/2024 9:16 UR
## 49 07/11/2024 9:37 UR
## 50 07/11/2024 10:24 UR
# Subset the male respondents who completed a bachelor's degree
data1<-data[data$Completed.Program=="Bachelor's degree"&data$Gender=="Male",]
data1$Completed.Program
## [1] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [4] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [7] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [10] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [13] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [16] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [19] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [22] "Bachelor's degree" "Bachelor's degree" "Bachelor's degree"
## [25] "Bachelor's degree" "Bachelor's degree"
# Number of male respondents with a bachelor's degree
nrow(data1)
## [1] 26
# Respondents who completed a bachelor's degree, regardless of gender
data2 <- data[data$Completed.Program == "Bachelor's degree", ]
# Pie chart of the gender counts within that subset
pie(
  table(data2$Gender),
  col = c("#FF0000", "#00FFFF")
)
# Simulated example: 100 random Gender / Purchased pairs (seeded for
# reproducibility)
set.seed(42)
Gender <- sample(x = c("Male", "Female"), size = 100, replace = TRUE)
Purchased <- sample(x = c("Yes", "No"), size = 100, replace = TRUE)
data <- data.frame(Gender = Gender, Purchased = Purchased)
# Cross-tabulate Gender (rows) against Purchased (columns)
table_data <- table(data$Gender, data$Purchased)
print(table_data)
##
## No Yes
## Female 29 27
## Male 26 18
# Chi-square test of independence between Gender and Purchased,
# run on the 2x2 contingency table built from the simulated data
chisq.test(table_data)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table_data
## X-squared = 0.27712, df = 1, p-value = 0.5986
# Bar plot
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
# Side-by-side bars: count of each Purchased value within each Gender
ggplot(data, aes(x = Gender, fill = Purchased)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Gender vs. Purchased",
    x = "Gender",
    y = "Count"
  )
# Mosaic plot of the same contingency table
mosaicplot(
  table_data,
  main = "Mosaic Plot of Gender vs Purchased",
  color = TRUE
)
# Load necessary libraries
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Set seed for reproducibility
set.seed(123)
# Create a sample dataset with one nominal and one ordinal predictor,
# a numeric target and a binary categorical target
data <- data.frame(
  nominal_predictor = factor(sample(c("A", "B", "C"), 100, replace = TRUE)),
  # Declare the levels explicitly: without `levels`, factor() sorts them
  # alphabetically (High < Low < Medium), which is not the intended order and
  # distorts anything that uses the ranks (e.g. as.numeric() for Spearman)
  ordinal_predictor = factor(
    sample(c("Low", "Medium", "High"), 100, replace = TRUE),
    levels = c("Low", "Medium", "High"),
    ordered = TRUE
  ),
  numeric_target = rnorm(100, mean = 50, sd = 10),
  categorical_target = factor(sample(c("Yes", "No"), 100, replace = TRUE))
)
# NOTE(review): output recorded further down that depends on the level order
# (contingency-table row order, Spearman's rho) was produced with the
# alphabetical ordering and will change when the document is re-run
# Inspect the data
head(data)
## nominal_predictor ordinal_predictor numeric_target categorical_target
## 1 C High 54.51504 No
## 2 C Medium 50.41233 Yes
## 3 C Medium 45.77503 Yes
## 4 B High 29.46753 No
## 5 C Low 61.31337 No
## 6 B Low 35.39360 Yes
# Distribution of the numeric target within each nominal level
ggplot(data, aes(x = nominal_predictor, y = numeric_target)) +
  geom_boxplot() +
  labs(
    title = "Box Plot of Numeric Target by Nominal Predictor",
    x = "Nominal Predictor",
    y = "Numeric Target"
  )
# One-way ANOVA: does the mean of the numeric target differ across levels?
anova_result <- aov(
  numeric_target ~ nominal_predictor,
  data = data
)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## nominal_predictor 2 199 99.26 0.949 0.391
## Residuals 97 10150 104.63
# Kruskal-Wallis rank sum test: non-parametric alternative to the one-way
# ANOVA of numeric_target across the levels of nominal_predictor
kruskal.test(numeric_target ~ nominal_predictor, data = data)
##
## Kruskal-Wallis rank sum test
##
## data: numeric_target by nominal_predictor
## Kruskal-Wallis chi-squared = 1.0731, df = 2, p-value = 0.5848
# Distribution of the numeric target within each ordinal level
ggplot(data, aes(x = ordinal_predictor, y = numeric_target)) +
  geom_boxplot() +
  labs(
    title = "Box Plot of Numeric Target by Ordinal Predictor",
    x = "Ordinal Predictor",
    y = "Numeric Target"
  )
# One-way ANOVA across the ordinal levels
anova_result_ordinal <- aov(
  numeric_target ~ ordinal_predictor,
  data = data
)
summary(anova_result_ordinal)
## Df Sum Sq Mean Sq F value Pr(>F)
## ordinal_predictor 2 47 23.68 0.223 0.801
## Residuals 97 10301 106.19
# Spearman's rank correlation between the ordinal predictor and the numeric
# target. as.numeric() converts the factor to its integer level codes, so the
# result is only meaningful when the levels are in the intended order.
# A three-level predictor always produces ties, for which an exact p-value
# cannot be computed; exact = FALSE requests the asymptotic test directly
# instead of falling back to it with a warning.
cor.test(as.numeric(data$ordinal_predictor), data$numeric_target, method = "spearman", exact = FALSE)
## Warning in cor.test.default(as.numeric(data$ordinal_predictor),
## data$numeric_target, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: as.numeric(data$ordinal_predictor) and data$numeric_target
## S = 165872, p-value = 0.9632
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.004667927
# Contingency table: ordinal_predictor levels (rows) by
# categorical_target levels (columns)
table(data$ordinal_predictor, data$categorical_target)
##
## No Yes
## High 20 12
## Low 18 12
## Medium 19 19
# Chi-squared test for association between the ordinal predictor and the
# categorical target; the result is stored and then printed
chisq_test_ordinal <- chisq.test(data$ordinal_predictor, data$categorical_target)
chisq_test_ordinal
##
## Pearson's Chi-squared test
##
## data: data$ordinal_predictor and data$categorical_target
## X-squared = 1.2648, df = 2, p-value = 0.5313
# Ordinal Logistic Regression if target is also ordinal (for illustration)
# Here we assume the categorical target has an order, e.g., "No" < "Yes"
# data$ordered_target <- factor(data$categorical_target, ordered = TRUE)
# ord_log_model <- polr(ordered_target ~ ordinal_predictor, data = data, Hess = TRUE)
# summary(ord_log_model)
# Work on a copy of the built-in mtcars data set and print it in full
mt_data <- mtcars
print(x = mt_data)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session; skip it when the document is rendered non-interactively
if (interactive()) {
  View(mt_data)
}
# Histogram of the mpg column
hist(mt_data$mpg, col = "skyblue")
# Scatterplot matrix of every pair of columns
pairs(mt_data)
# Column names of the data set
names(mt_data)
## [1] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear"
## [11] "carb"
library(MASS)
# Linear regression of mpg on cyl and drat, followed by the fit summary
lm_model <- lm(
  mpg ~ cyl + drat,
  data = mt_data
)
summary(lm_model)
##
## Call:
## lm(formula = mpg ~ cyl + drat, data = mt_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0845 -2.1061 -0.3432 1.8000 7.2096
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 28.7247 7.5921 3.783 0.000718 ***
## cyl -2.4835 0.4472 -5.554 5.45e-06 ***
## drat 1.8720 1.4937 1.253 0.220124
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.176 on 29 degrees of freedom
## Multiple R-squared: 0.7402, Adjusted R-squared: 0.7223
## F-statistic: 41.32 on 2 and 29 DF, p-value: 3.244e-09
# List the components stored in the fitted model object
names(lm_model)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
# Extract the estimated model coefficients directly from the fitted object
lm_model$coefficients
## (Intercept) cyl drat
## 28.724665 -2.483514 1.871983
# Estimated model coefficients via the coef() extractor function
coef(lm_model)
## (Intercept) cyl drat
## 28.724665 -2.483514 1.871983
# Confidence intervals for the model coefficients
# (2.5 % and 97.5 % bounds, i.e. the default 95 % level)
confint(lm_model)
## 2.5 % 97.5 %
## (Intercept) 13.196989 44.252341
## cyl -3.398123 -1.568905
## drat -1.182973 4.926938
# Predictions for the data the model was fitted on, together with the
# lower (lwr) and upper (upr) bounds of the prediction interval
mdf <- data.frame(
  predict(lm_model, interval = "prediction")
)
## Warning in predict.lm(lm_model, interval = "prediction"): predictions on current data refer to _future_ responses
# Keep only the prediction-interval bounds
mdf[, c("lwr", "upr")]
## lwr upr
## Mazda RX4 14.477977 27.77065
## Mazda RX4 Wag 14.477977 27.77065
## Datsun 710 19.219731 32.77575
## Hornet 4 Drive 12.777079 26.40150
## Hornet Sportabout 8.048752 21.45785
## Valiant 11.871128 26.10938
## Duster 360 8.164141 21.56709
## Merc 240D 18.857724 32.53873
## Merc 230 19.367170 32.89039
## Merc 280 14.507722 27.81578
## Merc 280C 14.507722 27.81578
## Merc 450SE 7.887116 21.31996
## Merc 450SL 7.887116 21.31996
## Merc 450SLC 7.887116 21.31996
## Cadillac Fleetwood 7.583004 21.09992
## Lincoln Continental 7.738422 21.20658
## Chrysler Imperial 8.201490 21.60462
## Fiat 128 19.678835 33.17776
## Honda Civic 20.760316 35.27865
## Toyota Corolla 19.922538 33.45821
## Toyota Corona 18.881356 32.55253
## Dodge Challenger 7.178054 20.86840
## AMC Javelin 8.048752 21.45785
## Camaro Z28 8.956619 22.72148
## Pontiac Firebird 7.907806 21.33671
## Fiat X1-9 19.678835 33.17776
## Porsche 914-2 20.237900 33.92908
## Lotus Europa 19.043043 32.65292
## Ford Pantera L 9.387272 24.12537
## Ferrari Dino 14.002813 27.19750
## Maserati Bora 8.709559 22.25719
## Volvo 142E 19.733333 33.23558
# Residuals-versus-fitted plot: in-sample predictions on the x-axis,
# model residuals on the y-axis
plot(predict(lm_model), residuals(lm_model))
Thank you!!!!