Independent_project

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the raw survey/score workbook that the whole analysis is based on
academic_performance <- read_excel(path = "data_academic_performance.xlsx")

# Make sure every add-on package this analysis relies on is available and
# install the missing ones.
# The list also covers packages that are only reached through `pkg::fun()`
# further down (psych, car, userfriendlyscience) or attached later with
# library() (stargazer, sjstats, gmodels); they were not checked before, so a
# fresh machine would fail at their first use.
needed_packages <- c(
  "pastecs", "ggplot2", "semTools", "FSA",
  "stargazer", "sjstats", "gmodels",
  "psych", "car", "userfriendlyscience"
)
# Keep only the packages that are not in the local library yet
not_installed <- setdiff(needed_packages, installed.packages()[, "Package"])
# Install whatever is missing (nothing happens when the list is empty)
if (length(not_installed) > 0) {
  install.packages(not_installed)
}
library(pastecs) #For creating descriptive statistic summaries

## 
## Attaching package: 'pastecs'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

library(ggplot2) #For creating histograms with more detail than plot
library(semTools) #For skewness and kurtosis

## Warning: package 'semTools' was built under R version 4.0.3

## Loading required package: lavaan

## This is lavaan 0.6-7

## lavaan is BETA software! Please report any bugs.

##

## ###############################################################################

## This is semTools 0.5-3

## All users of R (or SEM) are invited to submit functions or ideas for functions.

## ###############################################################################

library(stargazer)

## Warning: package 'stargazer' was built under R version 4.0.3

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

library(sjstats)#chi-square effect size

## Warning: package 'sjstats' was built under R version 4.0.3

library(gmodels) #For CrossTable() contingency tables used in the chi-square section

## Warning: package 'gmodels' was built under R version 4.0.3

## 
## Attaching package: 'gmodels'

## The following object is masked from 'package:sjstats':
## 
##     ci

##Checking the representativeness of the data ##

# Treat GENDER as categorical, then count how many students fall in each level
academic_performance[["GENDER"]] <- as.factor(academic_performance[["GENDER"]])
table(academic_performance[["GENDER"]])

## 
##    F    M 
## 5043 7368

# Treat SCHOOL_NAT (school nature) as categorical and count students per level
academic_performance[["SCHOOL_NAT"]] <- as.factor(academic_performance[["SCHOOL_NAT"]])
table(academic_performance[["SCHOOL_NAT"]])

## 
## PRIVATE  PUBLIC 
##    6565    5846

##missing data handling

# Recode the placeholder answers (0, "Not sure", "Not apply") as NA, one column
# at a time. Calling dplyr::na_if() on the whole data frame, as before, stops
# working from dplyr 1.1.0 onwards, where na_if() casts the placeholder to the
# type of its first argument and rejects a data frame.
missing_labels <- c("0", "Not sure", "Not apply")
df_2 <- academic_performance
df_2[] <- lapply(df_2, function(column) {
  is_placeholder <- if (is.numeric(column)) {
    column %in% 0
  } else {
    as.character(column) %in% missing_labels
  }
  column[is_placeholder] <- NA
  column
})
# Listwise deletion: keep only the rows with no missing value in any column
myData <- na.omit(df_2)
# PEOPLE_HOUSE still carries two labels in Spanish. "Once" means eleven, so
# recoding it to "One" (as was done before) merged eleven-person households
# into the one-person group; "Nueve" (nine) was left untranslated.
# NOTE(review): the chi-square table further down will now show "Nine" and
# "Eleven" as their own categories - confirm the labels against the codebook.
spanish_to_english <- c(Nueve = "Nine", Once = "Eleven")
people_house <- as.character(myData$PEOPLE_HOUSE)
needs_translation <- people_house %in% names(spanish_to_english)
people_house[needs_translation] <-
  unname(spanish_to_english[people_house[needs_translation]])
myData$PEOPLE_HOUSE <- as.factor(people_house)

Checking normality

# Histogram of the maths score (MAT_S11) with a normal curve on top.
# The plot is kept in `gg` so that it can be built up layer by layer.
gg <- ggplot(myData, aes(x = MAT_S11))
# Label of the x axis
gg <- gg + labs(x = "maths_score")
# Bars are 2 score points wide, drawn on the density scale and shaded by the
# number of observations in the bin. after_stat() replaces the
# `..density..` / `..count..` notation, which ggplot2 deprecated in 3.4.0.
gg <- gg + geom_histogram(
  aes(y = after_stat(density), fill = after_stat(count)),
  binwidth = 2,
  colour = "black"
)
gg <- gg + scale_fill_gradient("Count", low = "#DCDCDC", high = "#7C7C7C")
# Normal density with the sample mean and standard deviation, for comparison
gg <- gg + stat_function(
  fun = dnorm,
  color = "red",
  args = list(
    mean = mean(myData$MAT_S11, na.rm = TRUE),
    sd = sd(myData$MAT_S11, na.rm = TRUE)
  )
)
# Printing the object draws the plot
gg

# Normal Q-Q plot of the maths score, with the reference line drawn in red
with(myData, {
  qqnorm(MAT_S11)
  qqline(MAT_S11, col = 2)
})

### Summary statistics for the maths score
# basic = FALSE leaves out the "basic" block of stat.desc(); spelled out as
# FALSE because `F` is an ordinary variable that can be reassigned.
pastecs::stat.desc(myData$MAT_S11, basic = FALSE)

##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
##   64.0000000   64.4024564    0.1156878    0.2267700  140.5686361   11.8561645 
##     coef.var 
##    0.1840949

# Normality decision based on standardised skew and kurtosis:
# each statistic (element 1) is divided by its standard error (element 2).
tpskew <- semTools::skew(myData$MAT_S11)
tpkurt <- semTools::kurtosis(myData$MAT_S11)
# Standardised skew of the maths score
tpskew[1] / tpskew[2]

## skew (g1) 
##  16.11356

# Standardised kurtosis of the maths score (statistic over its standard error)
tpkurt[1] / tpkurt[2]

## Excess Kur (g2) 
##        1.847239

# Second check: how many standardised maths scores fall outside the accepted
# range? The percentage shows how serious any departure from normality is.
# Percentage of absolute z-scores greater than 1.96
zMathscore <- abs(scale(myData$MAT_S11))
FSA::perc(as.numeric(zMathscore), val = 1.96, dir = "gt")

## [1] 4.246406

# Percentage of absolute maths z-scores greater than 3.29
FSA::perc(as.numeric(zMathscore), val = 3.29, dir = "gt")

## [1] 0

# Histogram of the critical reading score (CR_S11) with a normal curve on top
gg <- ggplot(myData, aes(x = CR_S11))
# Label of the x axis (previously misspelled "Citcal_Reading_score")
gg <- gg + labs(x = "Critical_Reading_score")
# Bars are 2 score points wide, drawn on the density scale and shaded by the
# number of observations in the bin. after_stat() replaces the
# `..density..` / `..count..` notation, which ggplot2 deprecated in 3.4.0.
gg <- gg + geom_histogram(
  aes(y = after_stat(density), fill = after_stat(count)),
  binwidth = 2,
  colour = "black"
)
gg <- gg + scale_fill_gradient("Count", low = "#DCDCDC", high = "#7C7C7C")
# Normal density with the sample mean and standard deviation, for comparison
gg <- gg + stat_function(
  fun = dnorm,
  color = "red",
  args = list(
    mean = mean(myData$CR_S11, na.rm = TRUE),
    sd = sd(myData$CR_S11, na.rm = TRUE)
  )
)
# Printing the object draws the plot
gg

# Normal Q-Q plot of the critical reading score, reference line drawn in red
with(myData, {
  qqnorm(CR_S11)
  qqline(CR_S11, col = 2)
})

### Summary statistics for the critical reading score
# basic = FALSE leaves out the "basic" block of stat.desc(); spelled out as
# FALSE because `F` is an ordinary variable that can be reassigned.
pastecs::stat.desc(myData$CR_S11, basic = FALSE)

##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
##  61.00000000  60.86956108   0.09795809   0.19201645 100.78454567  10.03915065 
##     coef.var 
##   0.16492891

# Normality decision based on standardised skew and kurtosis:
# each statistic (element 1) is divided by its standard error (element 2).
tpskew <- semTools::skew(myData$CR_S11)
tpkurt <- semTools::kurtosis(myData$CR_S11)
# Standardised skew of the critical reading score
tpskew[1] / tpskew[2]

## skew (g1) 
##  9.572668

# Standardised kurtosis of the critical reading score
tpkurt[1] / tpkurt[2]

## Excess Kur (g2) 
##        9.651906

# Second check: how many standardised critical reading scores fall outside the
# accepted range? The percentage shows how serious any departure is.
# Percentage of absolute z-scores greater than 1.96
zCRscore <- abs(scale(myData$CR_S11))
FSA::perc(as.numeric(zCRscore), val = 1.96, dir = "gt")

## [1] 5.560316

# Percentage of absolute critical reading z-scores greater than 3.29
FSA::perc(as.numeric(zCRscore), val = 3.29, dir = "gt")

## [1] 0.3903647

difference test

# t-test ----
# Outcome: CR_S11 (critical reading score)
# Question: does the critical reading score differ by school nature?
# Descriptive statistics per SCHOOL_NAT group, returned as a matrix
psych::describeBy(myData$CR_S11, group = myData$SCHOOL_NAT, mat = TRUE)

##     item  group1 vars    n     mean       sd median  trimmed    mad min max
## X11    1 PRIVATE    1 5361 62.74408 9.835085   62.0 62.61203 8.8956  28 100
## X12    2  PUBLIC    1 5142 58.91521 9.876647   58.5 58.69397 9.6369  26 100
##     range      skew  kurtosis        se
## X11    72 0.2129527 0.6212798 0.1343245
## X12    74 0.2770183 0.4232489 0.1377347

# Levene's test: is the variance of CR_S11 the same in both school-nature groups?
car::leveneTest(CR_S11 ~ SCHOOL_NAT, data = myData)

## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value Pr(>F)
## group     1   0.563 0.4531
##       10501

# Independent-samples t-test (stats package) assuming equal variances
res <- stats::t.test(CR_S11 ~ SCHOOL_NAT, data = myData, var.equal = TRUE)
res

## 
##  Two Sample t-test
## 
## data:  CR_S11 by SCHOOL_NAT
## t = 19.903, df = 10501, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3.451782 4.205957
## sample estimates:
## mean in group PRIVATE  mean in group PUBLIC 
##              62.74408              58.91521

# Eta squared from the t-test: t^2 / (t^2 + df), rounded to 3 decimals
effes <- local({
  t_squared <- res$statistic * res$statistic
  round(t_squared / (t_squared + res$parameter), 3)
})
effes

##     t 
## 0.036

# Outcome: MAT_S11 (maths score)
# Question: does the maths score differ by school nature?
# Descriptive statistics per SCHOOL_NAT group, returned as a matrix
psych::describeBy(myData$MAT_S11, group = myData$SCHOOL_NAT, mat = TRUE)

##     item  group1 vars    n     mean       sd median  trimmed     mad min max
## X11    1 PRIVATE    1 5361 67.04346 12.10437     67 66.66146 11.8608  32 100
## X12    2  PUBLIC    1 5142 61.64897 10.93338     61 61.22460 10.3782  26 100
##     range      skew    kurtosis        se
## X11    68 0.3009775 -0.04596034 0.1653177
## X12    74 0.4078466  0.25677717 0.1524715

# Levene's test: is the variance of MAT_S11 the same in both school-nature groups?
car::leveneTest(MAT_S11 ~ SCHOOL_NAT, data = myData)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value   Pr(>F)    
## group     1  53.986 2.17e-13 ***
##       10501                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# The equal-variance assumption is not met for MAT_S11, so the t-test is run
# with var.equal = FALSE, which makes stats::t.test() use the Welch version.
res <- stats::t.test(MAT_S11 ~ SCHOOL_NAT, data = myData, var.equal = FALSE)
res

## 
##  Welch Two Sample t-test
## 
## data:  MAT_S11 by SCHOOL_NAT
## t = 23.987, df = 10463, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  4.953657 5.835329
## sample estimates:
## mean in group PRIVATE  mean in group PUBLIC 
##              67.04346              61.64897

# Eta squared from the t-test: t^2 / (t^2 + df), rounded to 3 decimals
effes <- local({
  t_squared <- res$statistic * res$statistic
  round(t_squared / (t_squared + res$parameter), 3)
})
effes

##     t 
## 0.052

# One-way ANOVA ----
# Outcome: CR_S11 (critical reading score), factor: SCHOOL_TYPE
# Descriptive statistics per SCHOOL_TYPE group, returned as a matrix
psych::describeBy(myData$CR_S11, group = myData$SCHOOL_TYPE, mat = TRUE)

##     item             group1 vars    n     mean       sd median  trimmed     mad
## X11    1           ACADEMIC    1 6547 61.92103 9.976892     62 61.77400 10.3782
## X12    2          TECHNICAL    1  895 58.74637 9.719524     58 58.64575  8.8956
## X13    3 TECHNICAL/ACADEMIC    1 3061 59.24142 9.951165     59 59.04655 10.3782
##     min max range      skew  kurtosis        se
## X11  28 100    72 0.2308671 0.4930983 0.1233031
## X12  32 100    68 0.2280661 0.4947679 0.3248879
## X13  26 100    74 0.2383928 0.4350684 0.1798632

# Bartlett's test: is the variance of CR_S11 equal across the school types?
stats::bartlett.test(CR_S11 ~ SCHOOL_TYPE, data = myData)

## 
##  Bartlett test of homogeneity of variances
## 
## data:  CR_S11 by SCHOOL_TYPE
## Bartlett's K-squared = 1.0611, df = 2, p-value = 0.5883

# Group variances are equal here, so Tukey is used as the post-hoc test
userfriendlyscience::oneway(
  y = myData$CR_S11,
  x = as.factor(myData$SCHOOL_TYPE),
  posthoc = "Tukey"
)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

## ### Oneway Anova for y=CR_S11 and x=SCHOOL_TYPE (groups: ACADEMIC, TECHNICAL, TECHNICAL/ACADEMIC)

## Registered S3 methods overwritten by 'ufs':
##   method                     from               
##   grid.draw.ggProportionPlot userfriendlyscience
##   pander.associationMatrix   userfriendlyscience
##   pander.dataShape           userfriendlyscience
##   pander.descr               userfriendlyscience
##   pander.normalityAssessment userfriendlyscience
##   print.CramersV             userfriendlyscience
##   print.associationMatrix    userfriendlyscience
##   print.confIntOmegaSq       userfriendlyscience
##   print.confIntV             userfriendlyscience
##   print.dataShape            userfriendlyscience
##   print.descr                userfriendlyscience
##   print.ggProportionPlot     userfriendlyscience
##   print.meanConfInt          userfriendlyscience
##   print.multiVarFreq         userfriendlyscience
##   print.normalityAssessment  userfriendlyscience
##   print.regrInfluential      userfriendlyscience
##   print.scaleDiagnosis       userfriendlyscience
##   print.scaleStructure       userfriendlyscience
##   print.scatterMatrix        userfriendlyscience

## Omega squared: 95% CI = [.01; .02], point estimate = .02
## Eta Squared: 95% CI = [.01; .02], point estimate = .02
## 
##                                         SS    Df      MS     F     p
## Between groups (error + effect)   19387.11     2 9693.56 97.96 <.001
## Within groups (error only)      1039052.19 10500   98.96            
## 
## 
## ### Post hoc test: Tukey
## 
##                              diff  lwr   upr   p adj
## TECHNICAL-ACADEMIC           -3.17 -4.01 -2.34 <.001
## TECHNICAL/ACADEMIC-ACADEMIC  -2.68 -3.19 -2.17 <.001
## TECHNICAL/ACADEMIC-TECHNICAL 0.5   -0.39 1.38  .39

# Same model fitted with stats::aov(), so individual statistics can be extracted
res2 <- stats::aov(formula = CR_S11 ~ SCHOOL_TYPE, data = myData)
res2

## Call:
##    stats::aov(formula = CR_S11 ~ SCHOOL_TYPE, data = myData)
## 
## Terms:
##                 SCHOOL_TYPE Residuals
## Sum of Squares      19387.1 1039052.2
## Deg. of Freedom           2     10500
## 
## Residual standard error: 9.947731
## Estimated effects may be unbalanced

# Store the F statistic of the SCHOOL_TYPE term (first row) for reporting
fstat <- summary(res2)[[1]][1, "F value"]
fstat

## [1] 97.95691

# Store the p value of the SCHOOL_TYPE term (first row) for reporting
aovpvalue <- summary(res2)[[1]][1, "Pr(>F)"]
aovpvalue

## [1] 7.07739e-43

# Effect size: keep only the second column (eta squared) of the eta_sq() result
aoveta <- sjstats::eta_sq(res2)[2]
aoveta

##   etasq
## 1 0.018

# Outcome: MAT_S11 (maths score), factor: SCHOOL_TYPE
# Descriptive statistics per SCHOOL_TYPE group, returned as a matrix
psych::describeBy(myData$MAT_S11, group = myData$SCHOOL_TYPE, mat = TRUE)

##     item             group1 vars    n     mean       sd median  trimmed     mad
## X11    1           ACADEMIC    1 6547 65.98503 12.05318     65 65.56270 11.8608
## X12    2          TECHNICAL    1  895 61.31620 11.04250     60 60.82845 10.3782
## X13    3 TECHNICAL/ACADEMIC    1 3061 61.91996 11.03259     61 61.53165 10.3782
##     min max range      skew    kurtosis        se
## X11  26 100    74 0.3356335 -0.02428624 0.1489636
## X12  31 100    69 0.4588160  0.39263192 0.3691101
## X13  32 100    68 0.4024213  0.27914741 0.1994095

# Bartlett's test: is the variance of MAT_S11 equal across the school types?
stats::bartlett.test(MAT_S11 ~ SCHOOL_TYPE, data = myData)

## 
##  Bartlett test of homogeneity of variances
## 
## data:  MAT_S11 by SCHOOL_TYPE
## Bartlett's K-squared = 37.809, df = 2, p-value = 6.165e-09

# Group variances are not equal here, so Games-Howell is used as the post-hoc test
userfriendlyscience::oneway(
  y = myData$MAT_S11,
  x = as.factor(myData$SCHOOL_TYPE),
  posthoc = "Games-Howell"
)

## ### Oneway Anova for y=MAT_S11 and x=SCHOOL_TYPE (groups: ACADEMIC, TECHNICAL, TECHNICAL/ACADEMIC)
## 
## Omega squared: 95% CI = [.02; .04], point estimate = .03
## Eta Squared: 95% CI = [.02; .04], point estimate = .03
## 
##                                         SS    Df       MS      F     p
## Between groups (error + effect)   43786.38     2 21893.19 160.48 <.001
## Within groups (error only)      1432465.44 10500   136.43             
## 
## 
## ### Post hoc test: Games-Howell
## 
##                               diff ci.lo ci.hi     t      df     p
## TECHNICAL-ACADEMIC           -4.67 -5.60 -3.73 11.73 1204.57 <.001
## TECHNICAL/ACADEMIC-ACADEMIC  -4.07 -4.65 -3.48 16.33 6484.22 <.001
## TECHNICAL/ACADEMIC-TECHNICAL  0.60 -0.38  1.59  1.44 1455.78  .321

# Same model fitted with stats::aov(); `res2` is kept because the F statistic,
# p value and eta squared reported afterwards are read from it.
res2 <- stats::aov(MAT_S11 ~ SCHOOL_TYPE, data = myData)
res2
# Bartlett's test rejected equal variances for MAT_S11, and the classic F test
# assumes them. Welch's one-way test does not, so it is reported alongside -
# the same switch the t-test on MAT_S11 makes with var.equal = FALSE.
stats::oneway.test(MAT_S11 ~ SCHOOL_TYPE, data = myData, var.equal = FALSE)

## Call:
##    stats::aov(formula = MAT_S11 ~ SCHOOL_TYPE, data = myData)
## 
## Terms:
##                 SCHOOL_TYPE Residuals
## Sum of Squares      43786.4 1432465.4
## Deg. of Freedom           2     10500
## 
## Residual standard error: 11.68012
## Estimated effects may be unbalanced

# Store the F statistic of the SCHOOL_TYPE term (first row) for reporting
fstat <- summary(res2)[[1]][1, "F value"]
fstat

## [1] 160.4775

# Store the p value of the SCHOOL_TYPE term (first row) for reporting
aovpvalue <- summary(res2)[[1]][1, "Pr(>F)"]
aovpvalue

## [1] 2.235959e-69

# Effect size: keep only the second column (eta squared) of the eta_sq() result
aoveta <- sjstats::eta_sq(res2)[2]
aoveta

##   etasq
## 1  0.03

# Chi-square test ----
# Comparing two nominal variables through a contingency table.
# Call pattern: CrossTable(predictor, outcome, chisq = TRUE, expected = TRUE)
# Rows: people in the household; columns: school nature. The output adds the
# expected counts, the chi-square test and standardised residuals, SPSS layout.
gmodels::CrossTable(
  myData$PEOPLE_HOUSE,
  myData$SCHOOL_NAT,
  chisq = TRUE,
  expected = TRUE,
  sresid = TRUE,
  format = "SPSS"
)

## 
##    Cell Contents
## |-------------------------|
## |                   Count |
## |         Expected Values |
## | Chi-square contribution |
## |             Row Percent |
## |          Column Percent |
## |           Total Percent |
## |            Std Residual |
## |-------------------------|
## 
## Total Observations in Table:  10503 
## 
##                     | myData$SCHOOL_NAT 
## myData$PEOPLE_HOUSE |  PRIVATE  |   PUBLIC  | Row Total | 
## --------------------|-----------|-----------|-----------|
##               Eight |       44  |       94  |      138  | 
##                     |   70.439  |   67.561  |           | 
##                     |    9.924  |   10.346  |           | 
##                     |   31.884% |   68.116% |    1.314% | 
##                     |    0.821% |    1.828% |           | 
##                     |    0.419% |    0.895% |           | 
##                     |   -3.150  |    3.217  |           | 
## --------------------|-----------|-----------|-----------|
##                Five |     1169  |     1307  |     2476  | 
##                     | 1263.814  | 1212.186  |           | 
##                     |    7.113  |    7.416  |           | 
##                     |   47.213% |   52.787% |   23.574% | 
##                     |   21.806% |   25.418% |           | 
##                     |   11.130% |   12.444% |           | 
##                     |   -2.667  |    2.723  |           | 
## --------------------|-----------|-----------|-----------|
##                Four |     2201  |     1821  |     4022  | 
##                     | 2052.932  | 1969.068  |           | 
##                     |   10.679  |   11.134  |           | 
##                     |   54.724% |   45.276% |   38.294% | 
##                     |   41.056% |   35.414% |           | 
##                     |   20.956% |   17.338% |           | 
##                     |    3.268  |   -3.337  |           | 
## --------------------|-----------|-----------|-----------|
##               Nueve |       19  |       43  |       62  | 
##                     |   31.646  |   30.354  |           | 
##                     |    5.054  |    5.269  |           | 
##                     |   30.645% |   69.355% |    0.590% | 
##                     |    0.354% |    0.836% |           | 
##                     |    0.181% |    0.409% |           | 
##                     |   -2.248  |    2.295  |           | 
## --------------------|-----------|-----------|-----------|
##                 One |       14  |       12  |       26  | 
##                     |   13.271  |   12.729  |           | 
##                     |    0.040  |    0.042  |           | 
##                     |   53.846% |   46.154% |    0.248% | 
##                     |    0.261% |    0.233% |           | 
##                     |    0.133% |    0.114% |           | 
##                     |    0.200  |   -0.204  |           | 
## --------------------|-----------|-----------|-----------|
##               Seven |      121  |      205  |      326  | 
##                     |  166.399  |  159.601  |           | 
##                     |   12.386  |   12.914  |           | 
##                     |   37.117% |   62.883% |    3.104% | 
##                     |    2.257% |    3.987% |           | 
##                     |    1.152% |    1.952% |           | 
##                     |   -3.519  |    3.594  |           | 
## --------------------|-----------|-----------|-----------|
##                 Six |      377  |      575  |      952  | 
##                     |  485.925  |  466.075  |           | 
##                     |   24.417  |   25.457  |           | 
##                     |   39.601% |   60.399% |    9.064% | 
##                     |    7.032% |   11.182% |           | 
##                     |    3.589% |    5.475% |           | 
##                     |   -4.941  |    5.045  |           | 
## --------------------|-----------|-----------|-----------|
##                 Ten |       17  |       27  |       44  | 
##                     |   22.459  |   21.541  |           | 
##                     |    1.327  |    1.383  |           | 
##                     |   38.636% |   61.364% |    0.419% | 
##                     |    0.317% |    0.525% |           | 
##                     |    0.162% |    0.257% |           | 
##                     |   -1.152  |    1.176  |           | 
## --------------------|-----------|-----------|-----------|
##               Three |     1132  |      836  |     1968  | 
##                     | 1004.518  |  963.482  |           | 
##                     |   16.179  |   16.868  |           | 
##                     |   57.520% |   42.480% |   18.738% | 
##                     |   21.115% |   16.258% |           | 
##                     |   10.778% |    7.960% |           | 
##                     |    4.022  |   -4.107  |           | 
## --------------------|-----------|-----------|-----------|
##      Twelve or more |        8  |       23  |       31  | 
##                     |   15.823  |   15.177  |           | 
##                     |    3.868  |    4.033  |           | 
##                     |   25.806% |   74.194% |    0.295% | 
##                     |    0.149% |    0.447% |           | 
##                     |    0.076% |    0.219% |           | 
##                     |   -1.967  |    2.008  |           | 
## --------------------|-----------|-----------|-----------|
##                 Two |      259  |      199  |      458  | 
##                     |  233.775  |  224.225  |           | 
##                     |    2.722  |    2.838  |           | 
##                     |   56.550% |   43.450% |    4.361% | 
##                     |    4.831% |    3.870% |           | 
##                     |    2.466% |    1.895% |           | 
##                     |    1.650  |   -1.685  |           | 
## --------------------|-----------|-----------|-----------|
##        Column Total |     5361  |     5142  |    10503  | 
##                     |   51.043% |   48.957% |           | 
## --------------------|-----------|-----------|-----------|
## 
##  
## Statistics for All Table Factors
## 
## 
## Pearson's Chi-squared test 
## ------------------------------------------------------------
## Chi^2 =  191.4071     d.f. =  10     p =  9.961248e-36 
## 
## 
##  
##        Minimum expected frequency: 12.72893

# Contingency table of school nature (rows) by people in the household (columns)
mytable <- xtabs(~ SCHOOL_NAT + PEOPLE_HOUSE, data = myData)
mytable

##           PEOPLE_HOUSE
## SCHOOL_NAT Eight Five Four Nueve  One Seven  Six  Ten Three Twelve or more  Two
##    PRIVATE    44 1169 2201    19   14   121  377   17  1132              8  259
##    PUBLIC     94 1307 1821    43   12   205  575   27   836             23  199

# Pearson chi-square test of independence on the contingency table.
# `correct` only matters for 2 x 2 tables, so it has no effect on this one.
ctest <- stats::chisq.test(mytable, correct = TRUE)
ctest

## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 191.41, df = 10, p-value < 2.2e-16

# Expected frequencies under independence
ctest[["expected"]]

##           PEOPLE_HOUSE
## SCHOOL_NAT    Eight     Five     Four    Nueve      One    Seven      Six
##    PRIVATE 70.43873 1263.814 2052.932 31.64639 13.27107 166.3987 485.9252
##    PUBLIC  67.56127 1212.186 1969.068 30.35361 12.72893 159.6013 466.0748
##           PEOPLE_HOUSE
## SCHOOL_NAT      Ten     Three Twelve or more      Two
##    PRIVATE 22.45873 1004.5176       15.82319 233.7749
##    PUBLIC  21.54127  963.4824       15.17681 224.2251

# Observed frequencies
ctest[["observed"]]

##           PEOPLE_HOUSE
## SCHOOL_NAT Eight Five Four Nueve  One Seven  Six  Ten Three Twelve or more  Two
##    PRIVATE    44 1169 2201    19   14   121  377   17  1132              8  259
##    PUBLIC     94 1307 1821    43   12   205  575   27   836             23  199

# Exact p value of the chi-square test (the printed test only shows "< 2.2e-16")
ctest[["p.value"]]

## [1] 9.961248e-36

# Effect size of the association: phi coefficient
# NOTE(review): phi is normally reported for 2 x 2 tables, and this table has
# more than two columns - Cramer's V is presumably the one to report; confirm.
sjstats::phi(tab = mytable)

## [1] 0.1349965

# Effect size of the association: Cramer's V, stored for reporting
cramer <- sjstats::cramer(tab = mytable)
cramer

## [1] 0.1349965

introducing Dummy variable:

# Replace the two categorical variables by 0/1 dummy variables (NA for any
# other value): SCHOOL_NAT -> PRIVATE = 0, PUBLIC = 1; GENDER -> M = 0, F = 1.
# The position of each value in the lookup vector selects the code.
myData$SCHOOL_NAT <- c(0, 1)[match(myData$SCHOOL_NAT, c("PRIVATE", "PUBLIC"))]
myData$GENDER <- c(0, 1)[match(myData$GENDER, c("M", "F"))]

# Homoscedasticity check ----

# Scatterplot of the maths score against the school-nature dummy variable
# aes(x, y)
scatter <- ggplot(myData, aes(SCHOOL_NAT, MAT_S11))
# Add the points and a red linear regression line without a confidence band
# (se = FALSE; `F` is avoided because it is a variable that can be reassigned)
scatter +
  geom_point() +
  geom_smooth(method = "lm", colour = "Red", se = FALSE) +
  labs(x = "school nature", y = "total grades in mathematics")

## `geom_smooth()` using formula 'y ~ x'

# Scatterplot of the critical reading score against the school-nature dummy
# aes(x, y)
scatter <- ggplot(myData, aes(SCHOOL_NAT, CR_S11))
# Add the points and a red linear regression line without a confidence band
# (se = FALSE; `F` is avoided because it is a variable that can be reassigned)
scatter +
  geom_point() +
  geom_smooth(method = "lm", colour = "Red", se = FALSE) +
  labs(x = "school nature", y = "total grades in critical reading")

## `geom_smooth()` using formula 'y ~ x'

# Correlation tests ----

# Spearman rank correlation between school nature (0/1) and critical reading.
# Both variables contain ties, so an exact p value cannot be computed;
# exact = FALSE requests the approximation directly, which gives the same
# result without the "Cannot compute exact p-value with ties" warning.
stats::cor.test(myData$SCHOOL_NAT, myData$CR_S11, method = "spearman", exact = FALSE)

## Warning in cor.test.default(myData$SCHOOL_NAT, myData$CR_S11, method =
## "spearman"): Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  myData$SCHOOL_NAT and myData$CR_S11
## S = 2.3082e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.1952958

# Spearman rank correlation between school nature (0/1) and the maths score.
# Both variables contain ties, so an exact p value cannot be computed;
# exact = FALSE requests the approximation directly, which gives the same
# result without the "Cannot compute exact p-value with ties" warning.
stats::cor.test(myData$SCHOOL_NAT, myData$MAT_S11, method = "spearman", exact = FALSE)

## Warning in cor.test.default(myData$SCHOOL_NAT, myData$MAT_S11, method =
## "spearman"): Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  myData$SCHOOL_NAT and myData$MAT_S11
## S = 2.3673e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## -0.2259508

Independent_project_analysis

Dorothy

06/01/2021

R Markdown

Checking normality

difference test

introducing Dummy variable: