#loading libraries
library(readr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Load data ----
# Read the CSV (path is relative to the working directory) into `df`.
df <- read_csv(file = "Application1.csv")
## Rows: 10320 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): budget, schoolcode, educationscore, classsize
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Examine the data ----
# Per-variable overview (n, missing, distinct values, quantiles/frequencies).
Hmisc::describe(df)
## df
##
## 4 Variables 10320 Observations
## --------------------------------------------------------------------------------
## budget
## n missing distinct Info Mean Gmd
## 10320 0 9 0.986 5.257 3.692
##
## lowest : 1 2 3 4 5, highest: 5 6 7 8 13
##
## Value 1 2 3 4 5 6 7 8 13
## Frequency 969 1277 1794 1142 914 1069 1159 853 1143
## Proportion 0.094 0.124 0.174 0.111 0.089 0.104 0.112 0.083 0.111
## --------------------------------------------------------------------------------
## schoolcode
## n missing distinct Info Mean Gmd .05 .10
## 10320 0 160 1 4632 2855 1129 1617
## .25 .50 .75 .90 .95
## 2478 4677 6393 8365 8904
##
## lowest : 767 785 789 815 981, highest: 8984 8989 9081 9085 9173
## --------------------------------------------------------------------------------
## educationscore
## n missing distinct Info Mean Gmd .05 .10
## 10320 0 92 1 0.3864 0.1998 0.14 0.16
## .25 .50 .75 .90 .95
## 0.25 0.36 0.50 0.64 0.73
##
## lowest : 0.09 0.10 0.11 0.12 0.13, highest: 0.96 0.97 0.98 1.01 1.02
## --------------------------------------------------------------------------------
## classsize
## n missing distinct Info Mean Gmd
## 10320 0 4 0.84 2.421 0.7569
##
## Value 1 2 3 4
## Frequency 1042 4302 4568 408
## Proportion 0.101 0.417 0.443 0.040
## --------------------------------------------------------------------------------
# Column-wise summary statistics for every variable in `df`.
base::summary(df)
## budget schoolcode educationscore classsize
## Min. : 1.000 Min. : 767 Min. :0.0900 Min. :1.000
## 1st Qu.: 3.000 1st Qu.:2478 1st Qu.:0.2500 1st Qu.:2.000
## Median : 4.000 Median :4677 Median :0.3600 Median :2.000
## Mean : 5.257 Mean :4632 Mean :0.3864 Mean :2.421
## 3rd Qu.: 7.000 3rd Qu.:6393 3rd Qu.:0.5000 3rd Qu.:3.000
## Max. :13.000 Max. :9173 Max. :1.0200 Max. :4.000
# Data prep ----
# Treat the categorical variables as factors so that modelling functions
# handle them as discrete levels rather than as numeric quantities.
# NOTE(review): `budget` is not used by `model1` below; presumably it is
# needed further down the script.
df[c("budget", "classsize")] <- lapply(
  df[c("budget", "classsize")],
  as.factor
)

# Modeling ----
# Linear regression of education score on class size. Because `classsize` is
# a factor, the fit has one coefficient per non-reference level, with the
# first level as the baseline absorbed into the intercept.
model1 <- lm(formula = educationscore ~ classsize, data = df)

# Analysis ----
# Coefficient table, residual summary and overall fit statistics.
summary(model1)
##
## Call:
## lm(formula = educationscore ~ classsize, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31876 -0.11876 -0.03876 0.08058 0.51058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.570499 0.004742 120.30 <2e-16 ***
## classsize2 -0.131738 0.005285 -24.93 <2e-16 ***
## classsize3 -0.261077 0.005255 -49.68 <2e-16 ***
## classsize4 -0.343318 0.008940 -38.40 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1531 on 10316 degrees of freedom
## Multiple R-squared: 0.2591, Adjusted R-squared: 0.2589
## F-statistic: 1202 on 3 and 10316 DF, p-value: < 2.2e-16