df = read.csv('https://www.ebi.ac.uk/~kola/wtsi_impc_xry_034_001_male_female_investigations_scored.csv')
#par(mfrow=c(2,1))
hist(df$female_score,col=2)

hist(df$male_score,col=3)

hist(df$female_score+df$male_score,col=2)

hist(df$female_score-df$male_score,col=2)

hist(df$male_score[df$biological_sample_group %in% 'control' & df$sex %in% 'male'],col=5)

hist(df$male_score[df$biological_sample_group %in% 'control' & df$sex %in% 'female'],col=4)

pairs(df[,c('male_score','female_score','weight')])

library(MASS)
lm1=stepAIC(lm(male_score~biological_sample_group+sex+weight,data = df),direction = 'both',k = log(nrow(df)))
## Start: AIC=18649.9
## male_score ~ biological_sample_group + sex + weight
##
## Df Sum of Sq RSS AIC
## <none> 53782 18650
## - biological_sample_group 1 164 53945 18681
## - weight 1 9677 63459 20853
## - sex 1 133811 187592 35350
summary(lm1)
##
## Call:
## lm(formula = male_score ~ biological_sample_group + sex + weight,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.0410 -1.3515 -0.0007 1.3539 11.6428
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.026227 0.155133 -77.522 < 2e-16
## biological_sample_groupexperimental 0.265123 0.041568 6.378 1.85e-10
## sexmale 8.332745 0.045685 182.394 < 2e-16
## weight 0.285814 0.005827 49.050 < 2e-16
##
## (Intercept) ***
## biological_sample_groupexperimental ***
## sexmale ***
## weight ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.006 on 13371 degrees of freedom
## (39 observations deleted due to missingness)
## Multiple R-squared: 0.86, Adjusted R-squared: 0.8599
## F-statistic: 2.737e+04 on 3 and 13371 DF, p-value: < 2.2e-16
plot(lm1)




hist(resid(lm1),col=2)

plot(density(resid(lm1)),col=2,lwd=5)
lines(density(rnorm(
n = 100000,
mean = mean(resid(lm1)),
sd = sd(resid(lm1))
)),col=1,lwd=1)

plot(y=df$male_score,df$weight,col=as.integer(df$misclassfied_flag))
