Introduction

This project analyzes an ASD screening dataset. The main goal is to explore screening scores and whether age and gender can predict autism screening results. My obvious hypothesis is that demographics does not predict ASD scores.

Data Cleaning

ASD <- read.csv("Autism.csv", stringsAsFactors = FALSE)

asd <- data.frame(lapply(ASD, function(x) {
  if (is.character(x)) {
    x[x == "?"] <- NA
  }
  return(x)
}))


asd$age <- as.numeric(asd$age)
asd$gender <- as.factor(asd$gender)
asd$ethnicity <- as.factor(asd$ethnicity)
asd$relation <- as.factor(asd$relation)

names(asd)
##  [1] "A1_Score"        "A2_Score"        "A3_Score"        "A4_Score"       
##  [5] "A5_Score"        "A6_Score"        "A7_Score"        "A8_Score"       
##  [9] "A9_Score"        "A10_Score"       "age"             "gender"         
## [13] "ethnicity"       "jundice"         "austim"          "contry_of_res"  
## [17] "used_app_before" "result"          "age_desc"        "relation"       
## [21] "Class.ASD"

Descriptive Statistics

mean(asd$result, na.rm = TRUE)
## [1] 4.875
sd(asd$result, na.rm = TRUE)
## [1] 2.501493
summary(asd$result)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   3.000   4.000   4.875   7.000  10.000

Gender Distribution

table(asd$gender)
## 
##   f   m 
## 337 367
prop.table(table(asd$gender)) * 100
## 
##        f        m 
## 47.86932 52.13068

ASD Class Distribution

table(asd$Class.ASD)
## 
##  NO YES 
## 515 189
prop.table(table(asd$Class.ASD)) * 100
## 
##       NO      YES 
## 73.15341 26.84659

Visualizations

Distribution of Autism Screening Scores

ggplot(asd, aes(x = result)) +
  geom_histogram(bins = 10, fill = "#69b3a2", color = "black") +
  labs(
    title = "Distribution of Autism Screening Scores",
    x = "Screening Score",
    y = "Number of Participants"
  ) +
  theme_minimal()

## Age and Screening Score

ggplot(asd, aes(x = age, y = result, color = gender)) +
  geom_jitter(width = 0.5, height = 0.1, alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Age and Screening Score by Gender (Raw Data)",
    x = "Age",
    y = "Screening Score",
    color = "Gender"
  ) +
  theme_minimal()

## Screening Score by Gender

ggplot(asd, aes(x = gender, y = result, fill = gender)) +
  geom_boxplot(alpha = 0.7) +
  labs(
    title = "Screening Score by Gender",
    x = "Gender",
    y = "Screening Score"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Gaussian Regression Model

model_gaussian <- glm(
  result ~ age + gender + ethnicity,
  family = gaussian,
  data = asd
)

summary(model_gaussian)
## 
## Call:
## glm(formula = result ~ age + gender + ethnicity, family = gaussian, 
##     data = asd)
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              4.230552   0.312883  13.521  < 2e-16 ***
## age                     -0.006909   0.005767  -1.198  0.23138    
## genderm                 -0.125310   0.194571  -0.644  0.51980    
## ethnicity'South Asian'  -0.218975   0.465548  -0.470  0.63827    
## ethnicityAsian           0.308919   0.325967   0.948  0.34367    
## ethnicityBlack           1.361555   0.437950   3.109  0.00197 ** 
## ethnicityHispanic        1.172436   0.703215   1.667  0.09599 .  
## ethnicityLatino          2.391893   0.583432   4.100 4.71e-05 ***
## ethnicityothers          1.143491   2.378906   0.481  0.63092    
## ethnicityOthers          1.399170   0.497307   2.813  0.00506 ** 
## ethnicityPasifika       -0.118917   0.745157  -0.160  0.87326    
## ethnicityTurkish         0.376326   0.996208   0.378  0.70574    
## ethnicityWhite-European  2.091270   0.294797   7.094 3.71e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 5.588069)
## 
##     Null deviance: 3869.4  on 608  degrees of freedom
## Residual deviance: 3330.5  on 596  degrees of freedom
##   (95 пропущенных наблюдений удалены)
## AIC: 2791
## 
## Number of Fisher Scoring iterations: 2
r_squared <- 1 - model_gaussian$deviance / model_gaussian$null.deviance
r_squared
## [1] 0.139269

Model Diagnostics

sim_res <- simulateResiduals(fittedModel = model_gaussian)
plot(sim_res)

Cleaned Model

Participants with age above 100 were removed :)

asd_clean <- asd[asd$age < 100, ]

model_clean <- glm(
  result ~ age + gender + ethnicity,
  family = gaussian,
  data = asd_clean
)
summary(model_clean)
## 
## Call:
## glm(formula = result ~ age + gender + ethnicity, family = gaussian, 
##     data = asd_clean)
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              4.146600   0.385930  10.744  < 2e-16 ***
## age                     -0.003671   0.010442  -0.352  0.72526    
## genderm                 -0.124987   0.194713  -0.642  0.52118    
## ethnicity'South Asian'  -0.218632   0.465886  -0.469  0.63904    
## ethnicityAsian           0.303785   0.326495   0.930  0.35252    
## ethnicityBlack           1.347769   0.439830   3.064  0.00228 ** 
## ethnicityHispanic        1.153001   0.705660   1.634  0.10280    
## ethnicityLatino          2.383210   0.584320   4.079 5.15e-05 ***
## ethnicityothers          1.110557   2.382272   0.466  0.64126    
## ethnicityOthers          1.390994   0.498151   2.792  0.00540 ** 
## ethnicityPasifika       -0.075547   0.754750  -0.100  0.92030    
## ethnicityTurkish         0.371021   0.997030   0.372  0.70993    
## ethnicityWhite-European  2.067818   0.301668   6.855 1.79e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 5.596158)
## 
##     Null deviance: 3852.7  on 607  degrees of freedom
## Residual deviance: 3329.7  on 595  degrees of freedom
##   (95 пропущенных наблюдений удалены)
## AIC: 2787.3
## 
## Number of Fisher Scoring iterations: 2

Age and Screening Score After Cleaning

ggplot(asd_clean, aes(x = age, y = result, color = gender)) +
  geom_jitter(width = 0.5, height = 0.1, alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Age and Screening Score by Gender (After Cleaning)",
    x = "Age",
    y = "Screening Score",
    color = "Gender"
  ) +
  theme_minimal()

# Logistic Regression Model

asd$ASD_binary <- ifelse(asd$Class.ASD == "YES", 1, 0)

model_logistic <- glm(
  ASD_binary ~ result + age + gender,
  family = binomial,
  data = asd
)

summary(model_logistic)
## 
## Call:
## glm(formula = ASD_binary ~ result + age + gender, family = binomial, 
##     data = asd)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.883e+02  4.721e+04  -0.006    0.995
## result       4.432e+01  7.052e+03   0.006    0.995
## age          9.603e-03  3.674e+02   0.000    1.000
## genderm     -1.834e-01  7.068e+03   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8.1782e+02  on 701  degrees of freedom
## Residual deviance: 5.9598e-08  on 698  degrees of freedom
##   (2 пропущенных наблюдений удалены)
## AIC: 8
## 
## Number of Fisher Scoring iterations: 25

GAM Model

gam_model <- gam(
  result ~ s(age) + gender + ethnicity,
  data = asd_clean,
  family = gaussian
)

summary(gam_model)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## result ~ s(age) + gender + ethnicity
## 
## Parametric coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              4.03780    0.27410  14.731  < 2e-16 ***
## genderm                 -0.12499    0.19471  -0.642  0.52118    
## ethnicity'South Asian'  -0.21863    0.46589  -0.469  0.63904    
## ethnicityAsian           0.30378    0.32649   0.930  0.35252    
## ethnicityBlack           1.34777    0.43983   3.064  0.00228 ** 
## ethnicityHispanic        1.15300    0.70566   1.634  0.10280    
## ethnicityLatino          2.38321    0.58432   4.079 5.15e-05 ***
## ethnicityothers          1.11056    2.38227   0.466  0.64126    
## ethnicityOthers          1.39099    0.49815   2.792  0.00540 ** 
## ethnicityPasifika       -0.07555    0.75475  -0.100  0.92030    
## ethnicityTurkish         0.37102    0.99703   0.372  0.70993    
## ethnicityWhite-European  2.06782    0.30167   6.855 1.79e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##        edf Ref.df     F p-value
## s(age)   1      1 0.124   0.725
## 
## R-sq.(adj) =  0.118   Deviance explained = 13.6%
## GCV = 5.7184  Scale est. = 5.5962    n = 608

GAM Plot

plot(gam_model, se = TRUE, col = "blue")

# Model Comparison

AIC(model_clean, gam_model)
##             df      AIC
## model_clean 14 2787.313
## gam_model   14 2787.313

Conclusion

The analysis explored autism screening scores using descriptive statistics, Gaussian, and a GAM model. As a result, there is no significant relationship between age/gender and ASD scores. As was recommended by the professor, I did jitter for my age VS screening score scatterplots. I also tried to add one visual and all of the important data.