df = read.csv('https://www.ebi.ac.uk/~kola/wtsi_impc_xry_034_001_male_female_investigations_scored.csv')
#par(mfrow=c(2,1))
hist(df$female_score,col=2)

hist(df$male_score,col=3)

hist(df$female_score+df$male_score,col=2)

hist(df$female_score-df$male_score,col=2)

hist(df$male_score[df$biological_sample_group %in% 'control' & df$sex %in% 'male'],col=5)

hist(df$male_score[df$biological_sample_group %in% 'control' & df$sex %in% 'female'],col=4)

pairs(df[,c('male_score','female_score','weight')])

library(MASS)
lm1=stepAIC(lm(male_score~biological_sample_group+sex+weight,data = df),direction = 'both',k = log(nrow(df)))
## Start:  AIC=18649.9
## male_score ~ biological_sample_group + sex + weight
## 
##                           Df Sum of Sq    RSS   AIC
## <none>                                  53782 18650
## - biological_sample_group  1       164  53945 18681
## - weight                   1      9677  63459 20853
## - sex                      1    133811 187592 35350
summary(lm1)
## 
## Call:
## lm(formula = male_score ~ biological_sample_group + sex + weight, 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0410  -1.3515  -0.0007   1.3539  11.6428 
## 
## Coefficients:
##                                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)                         -12.026227   0.155133 -77.522  < 2e-16
## biological_sample_groupexperimental   0.265123   0.041568   6.378 1.85e-10
## sexmale                               8.332745   0.045685 182.394  < 2e-16
## weight                                0.285814   0.005827  49.050  < 2e-16
##                                        
## (Intercept)                         ***
## biological_sample_groupexperimental ***
## sexmale                             ***
## weight                              ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.006 on 13371 degrees of freedom
##   (39 observations deleted due to missingness)
## Multiple R-squared:   0.86,  Adjusted R-squared:  0.8599 
## F-statistic: 2.737e+04 on 3 and 13371 DF,  p-value: < 2.2e-16
plot(lm1)

hist(resid(lm1),col=2)

plot(density(resid(lm1)),col=2,lwd=5)
lines(density(rnorm(
  n = 100000,
  mean = mean(resid(lm1)),
  sd = sd(resid(lm1))
)),col=1,lwd=1)

plot(y=df$male_score,df$weight,col=as.integer(df$misclassfied_flag))