#loading libraries
library(readr)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Loading data
# Declare every column type explicitly instead of relying on readr's type
# guessing: the parsed types then no longer depend on the file's contents, and
# readr does not print its column-specification message.
# A previously recorded run of this script reported 10320 rows and 4 columns,
# all guessed as double: budget, schoolcode, educationscore, classsize.
df <- read_csv(
  "Application1.csv",
  col_types = cols(
    budget = col_double(),
    schoolcode = col_double(),
    educationscore = col_double(),
    classsize = col_double()
  )
)
# Examining the data: per-variable overview (n, missing, distinct values,
# quantiles / frequency tables) from Hmisc
Hmisc::describe(df)
## df 
## 
##  4  Variables      10320  Observations
## --------------------------------------------------------------------------------
## budget 
##        n  missing distinct     Info     Mean      Gmd 
##    10320        0        9    0.986    5.257    3.692 
## 
## lowest :  1  2  3  4  5, highest:  5  6  7  8 13
##                                                                 
## Value          1     2     3     4     5     6     7     8    13
## Frequency    969  1277  1794  1142   914  1069  1159   853  1143
## Proportion 0.094 0.124 0.174 0.111 0.089 0.104 0.112 0.083 0.111
## --------------------------------------------------------------------------------
## schoolcode 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    10320        0      160        1     4632     2855     1129     1617 
##      .25      .50      .75      .90      .95 
##     2478     4677     6393     8365     8904 
## 
## lowest :  767  785  789  815  981, highest: 8984 8989 9081 9085 9173
## --------------------------------------------------------------------------------
## educationscore 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    10320        0       92        1   0.3864   0.1998     0.14     0.16 
##      .25      .50      .75      .90      .95 
##     0.25     0.36     0.50     0.64     0.73 
## 
## lowest : 0.09 0.10 0.11 0.12 0.13, highest: 0.96 0.97 0.98 1.01 1.02
## --------------------------------------------------------------------------------
## classsize 
##        n  missing distinct     Info     Mean      Gmd 
##    10320        0        4     0.84    2.421   0.7569 
##                                   
## Value          1     2     3     4
## Frequency   1042  4302  4568   408
## Proportion 0.101 0.417 0.443 0.040
## --------------------------------------------------------------------------------
# Base R overview of every column (min, quartiles, mean, max)
summary(object = df)
##      budget         schoolcode   educationscore     classsize    
##  Min.   : 1.000   Min.   : 767   Min.   :0.0900   Min.   :1.000  
##  1st Qu.: 3.000   1st Qu.:2478   1st Qu.:0.2500   1st Qu.:2.000  
##  Median : 4.000   Median :4677   Median :0.3600   Median :2.000  
##  Mean   : 5.257   Mean   :4632   Mean   :0.3864   Mean   :2.421  
##  3rd Qu.: 7.000   3rd Qu.:6393   3rd Qu.:0.5000   3rd Qu.:3.000  
##  Max.   :13.000   Max.   :9173   Max.   :1.0200   Max.   :4.000
# Data prep: convert the categorical variables (budget, classsize) to factors
# so they are handled as categories rather than as numbers
df[c("budget", "classsize")] <- lapply(
  df[c("budget", "classsize")],
  as.factor
)

# Modeling: linear regression of educationscore on classsize. classsize is a
# factor at this point, so lm() estimates one coefficient per level relative
# to the reference level.
model1 <- lm(formula = educationscore ~ classsize, data = df)

# Analysis: coefficient table, residual summary and fit statistics
summary(object = model1)
## 
## Call:
## lm(formula = educationscore ~ classsize, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.31876 -0.11876 -0.03876  0.08058  0.51058 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.570499   0.004742  120.30   <2e-16 ***
## classsize2  -0.131738   0.005285  -24.93   <2e-16 ***
## classsize3  -0.261077   0.005255  -49.68   <2e-16 ***
## classsize4  -0.343318   0.008940  -38.40   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1531 on 10316 degrees of freedom
## Multiple R-squared:  0.2591, Adjusted R-squared:  0.2589 
## F-statistic:  1202 on 3 and 10316 DF,  p-value: < 2.2e-16