This project analyzes an ASD screening dataset. The main goal is to explore screening scores and whether age and gender can predict autism screening results. My obvious hypothesis is that demographics does not predict ASD scores.
ASD <- read.csv("Autism.csv", stringsAsFactors = FALSE)
asd <- data.frame(lapply(ASD, function(x) {
if (is.character(x)) {
x[x == "?"] <- NA
}
return(x)
}))
asd$age <- as.numeric(asd$age)
asd$gender <- as.factor(asd$gender)
asd$ethnicity <- as.factor(asd$ethnicity)
asd$relation <- as.factor(asd$relation)
names(asd)## [1] "A1_Score" "A2_Score" "A3_Score" "A4_Score"
## [5] "A5_Score" "A6_Score" "A7_Score" "A8_Score"
## [9] "A9_Score" "A10_Score" "age" "gender"
## [13] "ethnicity" "jundice" "austim" "contry_of_res"
## [17] "used_app_before" "result" "age_desc" "relation"
## [21] "Class.ASD"
## [1] 4.875
## [1] 2.501493
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 3.000 4.000 4.875 7.000 10.000
##
## f m
## 337 367
##
## f m
## 47.86932 52.13068
ggplot(asd, aes(x = result)) +
geom_histogram(bins = 10, fill = "#69b3a2", color = "black") +
labs(
title = "Distribution of Autism Screening Scores",
x = "Screening Score",
y = "Number of Participants"
) +
theme_minimal()
## Age and Screening Score
ggplot(asd, aes(x = age, y = result, color = gender)) +
geom_jitter(width = 0.5, height = 0.1, alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Age and Screening Score by Gender (Raw Data)",
x = "Age",
y = "Screening Score",
color = "Gender"
) +
theme_minimal()
## Screening Score by Gender
ggplot(asd, aes(x = gender, y = result, fill = gender)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Screening Score by Gender",
x = "Gender",
y = "Screening Score"
) +
theme_minimal() +
theme(legend.position = "none")
# Gaussian Regression Model
model_gaussian <- glm(
result ~ age + gender + ethnicity,
family = gaussian,
data = asd
)
summary(model_gaussian)##
## Call:
## glm(formula = result ~ age + gender + ethnicity, family = gaussian,
## data = asd)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.230552 0.312883 13.521 < 2e-16 ***
## age -0.006909 0.005767 -1.198 0.23138
## genderm -0.125310 0.194571 -0.644 0.51980
## ethnicity'South Asian' -0.218975 0.465548 -0.470 0.63827
## ethnicityAsian 0.308919 0.325967 0.948 0.34367
## ethnicityBlack 1.361555 0.437950 3.109 0.00197 **
## ethnicityHispanic 1.172436 0.703215 1.667 0.09599 .
## ethnicityLatino 2.391893 0.583432 4.100 4.71e-05 ***
## ethnicityothers 1.143491 2.378906 0.481 0.63092
## ethnicityOthers 1.399170 0.497307 2.813 0.00506 **
## ethnicityPasifika -0.118917 0.745157 -0.160 0.87326
## ethnicityTurkish 0.376326 0.996208 0.378 0.70574
## ethnicityWhite-European 2.091270 0.294797 7.094 3.71e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 5.588069)
##
## Null deviance: 3869.4 on 608 degrees of freedom
## Residual deviance: 3330.5 on 596 degrees of freedom
## (95 пропущенных наблюдений удалены)
## AIC: 2791
##
## Number of Fisher Scoring iterations: 2
## [1] 0.139269
Participants with age above 100 were removed :)
asd_clean <- asd[asd$age < 100, ]
model_clean <- glm(
result ~ age + gender + ethnicity,
family = gaussian,
data = asd_clean
)
summary(model_clean)##
## Call:
## glm(formula = result ~ age + gender + ethnicity, family = gaussian,
## data = asd_clean)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.146600 0.385930 10.744 < 2e-16 ***
## age -0.003671 0.010442 -0.352 0.72526
## genderm -0.124987 0.194713 -0.642 0.52118
## ethnicity'South Asian' -0.218632 0.465886 -0.469 0.63904
## ethnicityAsian 0.303785 0.326495 0.930 0.35252
## ethnicityBlack 1.347769 0.439830 3.064 0.00228 **
## ethnicityHispanic 1.153001 0.705660 1.634 0.10280
## ethnicityLatino 2.383210 0.584320 4.079 5.15e-05 ***
## ethnicityothers 1.110557 2.382272 0.466 0.64126
## ethnicityOthers 1.390994 0.498151 2.792 0.00540 **
## ethnicityPasifika -0.075547 0.754750 -0.100 0.92030
## ethnicityTurkish 0.371021 0.997030 0.372 0.70993
## ethnicityWhite-European 2.067818 0.301668 6.855 1.79e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 5.596158)
##
## Null deviance: 3852.7 on 607 degrees of freedom
## Residual deviance: 3329.7 on 595 degrees of freedom
## (95 пропущенных наблюдений удалены)
## AIC: 2787.3
##
## Number of Fisher Scoring iterations: 2
ggplot(asd_clean, aes(x = age, y = result, color = gender)) +
geom_jitter(width = 0.5, height = 0.1, alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Age and Screening Score by Gender (After Cleaning)",
x = "Age",
y = "Screening Score",
color = "Gender"
) +
theme_minimal()
# Logistic Regression Model
asd$ASD_binary <- ifelse(asd$Class.ASD == "YES", 1, 0)
model_logistic <- glm(
ASD_binary ~ result + age + gender,
family = binomial,
data = asd
)
summary(model_logistic)##
## Call:
## glm(formula = ASD_binary ~ result + age + gender, family = binomial,
## data = asd)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.883e+02 4.721e+04 -0.006 0.995
## result 4.432e+01 7.052e+03 0.006 0.995
## age 9.603e-03 3.674e+02 0.000 1.000
## genderm -1.834e-01 7.068e+03 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8.1782e+02 on 701 degrees of freedom
## Residual deviance: 5.9598e-08 on 698 degrees of freedom
## (2 пропущенных наблюдений удалены)
## AIC: 8
##
## Number of Fisher Scoring iterations: 25
gam_model <- gam(
result ~ s(age) + gender + ethnicity,
data = asd_clean,
family = gaussian
)
summary(gam_model)##
## Family: gaussian
## Link function: identity
##
## Formula:
## result ~ s(age) + gender + ethnicity
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.03780 0.27410 14.731 < 2e-16 ***
## genderm -0.12499 0.19471 -0.642 0.52118
## ethnicity'South Asian' -0.21863 0.46589 -0.469 0.63904
## ethnicityAsian 0.30378 0.32649 0.930 0.35252
## ethnicityBlack 1.34777 0.43983 3.064 0.00228 **
## ethnicityHispanic 1.15300 0.70566 1.634 0.10280
## ethnicityLatino 2.38321 0.58432 4.079 5.15e-05 ***
## ethnicityothers 1.11056 2.38227 0.466 0.64126
## ethnicityOthers 1.39099 0.49815 2.792 0.00540 **
## ethnicityPasifika -0.07555 0.75475 -0.100 0.92030
## ethnicityTurkish 0.37102 0.99703 0.372 0.70993
## ethnicityWhite-European 2.06782 0.30167 6.855 1.79e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age) 1 1 0.124 0.725
##
## R-sq.(adj) = 0.118 Deviance explained = 13.6%
## GCV = 5.7184 Scale est. = 5.5962 n = 608
The analysis explored autism screening scores using descriptive statistics, Gaussian, and a GAM model. As a result, there is no significant relationship between age/gender and ASD scores. As was recommended by the professor, I did jitter for my age VS screening score scatterplots. I also tried to add one visual and all of the important data.