To perform a baseline statistical analysis on a dataset and report a summary of the descriptive analysis.
Step-1: Load the data and required R-packages for data analysis
Step-2: Apply basic statistic functions
Step-3: Create appropriate visualizations
Step-4: Report the findings based on descriptive analysis
Loading data
# Read the marks sheet from the course repository; the first row of the CSV
# supplies the column names.
df <- read.csv(
  "https://raw.githubusercontent.com/sijuswamy/StatLab/main/Dataset_1.csv",
  header = TRUE
)
# Gender is used as a grouping variable below, so store it as a factor.
df[["Gender"]] <- as.factor(df[["Gender"]])
# Preview the first rows of the data.
head(df)
## Student_Name Gender X20IMCAT201 X20IMCAT203 X20IMCAT205 X20IMCAT207
## 1 ABEL MATHEW ABRAHAM M 1.5 1.5 18 1
## 2 ABEN B JOHN M 8.5 6.0 26 5
## 3 ABIN SAJI M 0.0 0.0 21 0
## 4 ADWAITH SANIL M 8.0 13.0 18 4
## 5 AKSHAY BABU M 8.0 4.0 14 10
## 6 ALEN T BINU M 29.5 24.0 43 19
## X20IMCAT209
## 1 16
## 2 23
## 3 0
## 4 17
## 5 5
## 6 35
# Inspect the structure of the data frame: column names, types and leading values.
str(df)
## 'data.frame': 56 obs. of 7 variables:
## $ Student_Name: chr "ABEL MATHEW ABRAHAM" "ABEN B JOHN" "ABIN SAJI" "ADWAITH SANIL" ...
## $ Gender : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 1 1 2 ...
## $ X20IMCAT201 : num 1.5 8.5 0 8 8 29.5 16.5 21 25.5 30 ...
## $ X20IMCAT203 : num 1.5 6 0 13 4 24 8.5 15 11.5 32.5 ...
## $ X20IMCAT205 : num 18 26 21 18 14 43 25 22 25 32 ...
## $ X20IMCAT207 : num 1 5 0 4 10 19 10 11 1 12 ...
## $ X20IMCAT209 : num 16 23 0 17 5 35 12 17 30 37 ...
Finding Column sums
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Drop the two non-mark columns so that only the numeric subject columns remain.
df1 <- select(df, -c(Student_Name, Gender))
# Column-wise totals and averages of the subject marks.
Sub_total <- colSums(df1)
Sub_average <- colMeans(df1)
# Display the averages rounded to two decimal places.
round(Sub_average, digits = 2)
## X20IMCAT201 X20IMCAT203 X20IMCAT205 X20IMCAT207 X20IMCAT209
## 20.54 12.17 26.68 11.29 24.68
library(ggplot2)
# Box plot of X20IMCAT201 marks for each gender.
crop <- ggplot(df, aes(Gender, X20IMCAT201)) +
  geom_boxplot() +
  labs(x = "Gender", y = "Computer Organization marks")
crop
library(ggplot2)
# Box plot of X20IMCAT203 marks for each gender.
crop <- ggplot(df, aes(Gender, X20IMCAT203)) +
  geom_boxplot() +
  labs(x = "Gender", y = "Mathematics marks")
crop
library(ggplot2)
# Box plot of X20IMCAT205 marks for each gender.
crop <- ggplot(df, aes(Gender, X20IMCAT205)) +
  geom_boxplot() +
  labs(x = "Gender", y = "OOPS marks")
crop
library(ggplot2)
# Box plot of X20IMCAT207 marks for each gender.
crop <- ggplot(df, aes(Gender, X20IMCAT207)) +
  geom_boxplot() +
  labs(x = "Gender", y = "Accountancy marks")
crop
library(ggplot2)
# Box plot of X20IMCAT209 marks for each gender.
crop <- ggplot(df, aes(Gender, X20IMCAT209)) +
  geom_boxplot() +
  labs(x = "Gender", y = "DS marks")
crop
# Histograms of each subject's marks (bin width 5), with bars stacked and
# coloured by gender.
#
# The fill comes from the Gender mapping in aes(). A constant `fill` passed to
# geom_histogram() would override that mapping, leaving every bar the same
# colour and the legend requested by theme(legend.position = "top") empty.
# Gender is written as a bare column name because aes() is evaluated inside
# `data`; `df$Gender` bypasses that and triggers a ggplot2 warning.
ggplot(data = df, aes(x = X20IMCAT201, fill = Gender)) +
  geom_histogram(binwidth = 5, color = "red") +
  theme(legend.position = "top") # + facet_grid(~Gender)
ggplot(data = df, aes(x = X20IMCAT203, fill = Gender)) +
  geom_histogram(binwidth = 5, color = "red") +
  theme(legend.position = "top") # + facet_grid(~Gender)
ggplot(data = df, aes(x = X20IMCAT205, fill = Gender)) +
  geom_histogram(binwidth = 5, color = "red") +
  theme(legend.position = "top") # + facet_grid(~Gender)
ggplot(data = df, aes(x = X20IMCAT207, fill = Gender)) +
  geom_histogram(binwidth = 5, color = "red") +
  theme(legend.position = "top") # + facet_grid(~Gender)
ggplot(data = df, aes(x = X20IMCAT209, fill = Gender)) +
  geom_histogram(binwidth = 5, color = "red") +
  theme(legend.position = "top") # + facet_grid(~Gender)
# Kernel density estimate of the marks in each subject, drawn with base
# graphics (one plot per subject). The calls are kept in this exact form
# because density() records its call and plot() shows it as the plot title.
plot(density(df$X20IMCAT201))
plot(density(df$X20IMCAT203))
plot(density(df$X20IMCAT205))
plot(density(df$X20IMCAT207))
plot(density(df$X20IMCAT209))
# Median of the X20IMCAT201 marks.
median(df$X20IMCAT201)
## [1] 21
# Mode() is provided by the DescTools package.
library(DescTools)
# Most frequent X20IMCAT201 mark; the "freq" attribute in the output is its count.
Mode(df$X20IMCAT201)
## [1] 21
## attr(,"freq")
## [1] 4
User-defined function
# Sample variance of the marks in each subject.
var(df$X20IMCAT201)
## [1] 81.95706
var(df$X20IMCAT203)
## [1] 65.95706
var(df$X20IMCAT205)
## [1] 102.113
var(df$X20IMCAT207)
## [1] 60.75325
var(df$X20IMCAT209)
## [1] 100.6948
# Correlation (Pearson, the default method) between the X20IMCAT203 and
# X20IMCAT207 marks.
cor(df$X20IMCAT203,df$X20IMCAT207)
## [1] 0.3856813
# Return the most frequent value (the statistical mode) of a vector.
#
# Args:
#   a:     An atomic vector (e.g. numeric, character, logical or factor).
#   na.rm: If TRUE, missing values are dropped before counting. The default
#          (FALSE) keeps the original behaviour, in which NA is counted like
#          any other value and can itself be returned as the mode.
#
# Returns:
#   A length-one vector of the same type as `a`. When several values tie for
#   the highest count, the one that appears first in `a` is returned. An empty
#   input yields NA.
calcmode <- function(a, na.rm = FALSE) {
  if (na.rm) {
    a <- a[!is.na(a)]
  }
  # Distinct values in order of first appearance. Not named `vector`, which
  # would mask base::vector() inside the function.
  distinct_values <- unique(a)
  # match() maps every element to the index of its distinct value, so
  # tabulate() yields one count per distinct value.
  counts <- tabulate(match(a, distinct_values))
  # which.max() returns the first maximum, so ties resolve by position.
  distinct_values[which.max(counts)]
}
# Mode of the X20IMCAT201 marks computed with the user-defined calcmode().
calcmode(df$X20IMCAT201)
## [1] 21
# Sample standard deviation of the X20IMCAT201 marks.
sd(df$X20IMCAT201)
## [1] 9.053014
Various statistical analyses were performed on secondary data, and appropriate visualizations were used to interpret the statistical estimates.