## Tie recriprocity function
# This function calculates reciprocity among participants for whom we have complete data
# i.e., only among ties who have the potential to reciprocate because they were in our study and completed the questionnaire
tieReciprocated <- function(edgelist) {
completed <- unique(edgelist$from)
tierecip <- rep(NA,nrow(edgelist))
for (i in 1:nrow(edgelist)){
if(is.na(edgelist[i,]$to) | is.na(tierecip[i])==FALSE) next
if(!(edgelist[i,]$to %in% completed)) next
match <- subset(edgelist, to==edgelist[i,]$from & from==edgelist[i,]$to)
tierecip[i] <- ifelse(nrow(match)==1, 1, 0)
}
tierecip
}
## Function to rescale data to 0-1
rescale01 <- function(x){(x - min(x, na.rm = T))/(max(x, na.rm = T) - min(x, na.rm = T))}
wave1 <- w1 # Keeping another version here for safe measure
# wave3 <- w3 # " " And here
# summary(w1$H1FS6) # Wave 1 depression: eyeballing descriptives
# w1 <- wave1 %>% rename(depression = H1FS6) # working re-code as I decide upon variables
# ncol(w1)
# summary(w3$H3SP9) # Wave 3 depression: eyeballing descriptives
# w3 <- wave3 %>% rename(depression = H3SP9)
# ncol(w3)
# summary(w3num$H3SP9)
add <- #merge(
w1#, w3, by = "AID", all = T) # since w1 has 3238 columns, the 1st 3248 items in add will be w1; the rest--w3
nrow(edge2 <- edgelists[[2]])
[1] 447
friend2 <- edge2
frlong2 <- friend2 #[1:(nrow(friend2)/2),] # first half of nominations
frlongT <- frlong2 # duplicating a parallel to frlong2
frlongT$from <- frlongT$to
nrow(frlongT) # I like to keep checking that the row number is always what it should be and that I didn't make a mistake in that regard.
[1] 447
#frlongT$weight <- friend2[((nrow(friend2)/2) + 1):nrow(friend2),3] # need to put the correct rac
# if they nominated a Friend but did not give a weight, set weight to minimum value (1)
frlong2$weight[is.na(frlong2$weight) & !(is.na(frlong2$to))] <- 1
att <- b2
(att$PPID <- c(1:nrow(b2)))
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
[18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
[35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
[52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
[69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
[86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
[103] 103 104 105 106
att$from <- c(1:nrow(att))
#save copy of frlong2 with PP attributes
names(att)[6] <- 'from'
head(frlongPPatt1 <- join(frlong2, att))
from to weight sex race grade school gender
1 1 95 5 1 white 9 NA Male
2 1 108 5 1 white 9 NA Male
3 2 30 2 1 white 11 NA Male
4 2 85 2 1 white 11 NA Male
5 3 10 5 1 white 11 NA Male
6 3 21 2 1 white 11 NA Male
nrow(frlongPPatt1)
[1] 447
att2 <- join(frlongPPatt1[ , c("from", "weight")], att)
nrow(att2)
[1] 447
head(frlongPweight <- join(frlongT[ , 1:2], att2))
from to weight sex race grade school gender
1 95 95 1 2 white 9 NA Female
2 95 95 1 2 white 9 NA Female
3 95 95 1 2 white 9 NA Female
4 108 108 6 NA <NA> NA NA <NA>
5 30 30 2 1 white 7 NA Male
6 30 30 4 1 white 7 NA Male
unique(frlongPweight) %>% DT::datatable()
frlongPweight <- frlongPweight[1:447, ] # only taking the rows relevant to the first 447 rows of nominations in the original "from" and "to" columns
head(frlongP <- join(frlongT[ , 1:2], att)) %>% DT::datatable()# append Friends' background information to 1/2 half data
nrow(frlongP)
[1] 447
frlong_final <- frlongPPatt1 # since frlongPPatt1 is the original order, I'm using this as a final template
# calculate homophily measures for each Friend *** note to self, remove what not using before posting ***
# head(frlongP$hom.group <- ifelse(frlongP$Group==frlongPPatt1$Group,1,0))
summary(frlong_final$hom.gend <- ifelse(frlongP$gender==frlongPPatt1$gender,1,0))
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0000 0.0000 1.0000 0.5315 1.0000 1.0000 18
summary(frlong_final$hom.race <- ifelse(frlongP$race==frlongPPatt1$race,1,0))
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0000 1.0000 1.0000 0.8928 1.0000 1.0000 18
summary(frlong_final$hom.grade <- ifelse(frlongP$grade==frlongPPatt1$grade,1,0))
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0000 0.0000 0.0000 0.1795 0.0000 1.0000 18
summary(frlong_final$hom.school <- ifelse(frlongP$school==frlongPPatt1$school,1,0))
Mode NA's
logical 447
summary(frlong_final$hom.weight <- ifelse(frlongPweight$weight==frlongPPatt1$weight,1,0))
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0000 0.0000 0.0000 0.2438 0.0000 1.0000 4
summary(frlong_final$bff <- ifelse(frlongPPatt1$weight == max(frlongPPatt1$weight), 1, 0)) # People rated the highest in their friend's network will get a flag for best friend "bff"
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.00000 0.04027 0.00000 1.00000
summary(frlongP$bff <- ifelse(frlongPweight$weight == max(frlongPweight$weight, na.rm = T), 1, 0)) # best friends of the nominees
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0000 0.0000 0.0000 0.0316 0.0000 1.0000 4
summary(frlong_final$bffrecipbff <- ifelse(frlongP$bff == frlong_final$bff & frlong_final$bff == 1, 1, 0)) # checking if their best friends reciprocate as best friends (highest weight)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 0 0 0
frlong_final %>% DT::datatable()
# write.csv(frlong_final, file = "frlong_data_mining.csv")
# calculate reciprocity (1,0) for each Friend
summary(frlong_final$tierecip <- tieReciprocated(frlongPPatt1)) # if the friend reciprocates the nomination, there will be a 1. Summary of total network reciprocity in this summary.
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0000 0.0000 0.0000 0.4612 1.0000 1.0000 22
attribs_com2 <- comm_att[[2]]
attribs_com2$from <- c(1:nrow(attribs_com2)) # making the ID column
frwide_final <- frlong_final %>% select(everything()) %>%
group_by(from) %>%
summarise(
ave.weight = mean(weight, na.rm = T),
reciproc = mean(tierecip, na.rm = T),
hom.gend = mean(hom.gend, na.rm = T),
hom.race = mean(hom.race, na.rm = T),
hom.grade = mean(hom.grade, na.rm = T),
hom.weight = mean(hom.weight, na.rm = T),
hom.bff = ifelse(sum(bff > 0, na.rm = T), 1, 0),
bffrecipbff = ifelse(sum(bff > 0, na.rm = T), 1, 0))
add_construct <- merge(frwide_final, attribs_com2, by = "from", all = T)
head(add_construct) %>% DT::datatable()
# write.csv(add_construct, file = "data_mining_homophilyrecip.csv") # Just to make sure it's safe
I turned the network into a matrix. If I were to do bunch of these, I’d want to make sparse matrices. I’m using the Igraph package to make a network object and I’m finding the network density (which can range from 0 to 1). The density seems to be very low: only 0.041.
# form T1 Friendship network
frel1 <- subset(frlongPPatt1, select=c("from","to","weight")) # making plain edgelist
frel1$weight <- ifelse(is.na(frel1$to), NA, frel1$weight)
frel1$to <- ifelse(is.na(frel1$to), frel1$from, frel1$to)
frel1 <- unique(frel1)
frlvls1 <- unique(c(as.character(frel1$from), as.character(frel1$to)))
frel1$from <- factor(frel1$from, levels = frlvls1)
frel1$to <- factor(frel1$to, levels = frlvls1)
frel1$weight <- ifelse(frel1$from==frel1$to, 0, frel1$weight) # retaining isolates
frel1 <- na.omit(frel1)
frmat1 <- reshape2::acast(frel1, from~to, value.var="weight", fill=0, drop=F)
frg1 <- igraph::graph.adjacency(frmat1, mode=c("directed"), weighted=T, diag=F)
igraph::graph.density(frg1)
[1] 0.04093407
Below, I’m calculating individual-level centrality!
# calculate Friendship degree/strength *** note to self, remove what not using before posting ***
frout1 <- as.data.frame(igraph::degree(frg1, mode="out"))
frout1$PPID <- row.names(frout1)
names(frout1) <- c("outdeg","PPID")
frin1 <- as.data.frame(igraph::degree(frg1, mode="in"))
frin1$PPID <- row.names(frin1)
names(frin1) <- c("indeg","PPID")
frdeg1 <- as.data.frame(igraph::degree(frg1, mode="all"))
frdeg1$PPID <- row.names(frdeg1)
names(frdeg1) <- c("degree","PPID")
frclose1 <- as.data.frame(igraph::closeness(frg1, mode="all"))
frclose1$PPID <- row.names(frclose1)
names(frclose1) <- c("closeness","PPID")
frbtwn1 <- as.data.frame(igraph::betweenness(frg1, directed = T))
frbtwn1$PPID <- row.names(frbtwn1)
names(frbtwn1) <- c("betweenness","PPID")
freigen1 <- as.data.frame(igraph::evcent(frg1))
freigen1$PPID <- row.names(freigen1)
freigen1 <- freigen1[ , c("vector", "PPID")] # extra step bc eigenvector output is a data frame
names(freigen1) <- c("eigenvector","PPID")
head(freigen1) %>% DT::datatable()
frweightout1 <- as.data.frame(strength(frg1, mode="out"))
frweightout1$PPID <- row.names(frweightout1)
names(frweightout1) <- c("weightout","PPID")
head(frweightout1) %>% DT::datatable()
frweightin1 <- as.data.frame(strength(frg1, mode="in"))
frweightin1$PPID <- row.names(frweightin1)
names(frweightin1) <- c("weightin","PPID")
Here, I’m aggregating all of the centrality scores.
frweightin1 %>% DT::datatable()
join(frout1, frin1) %>%
join(.,frweightout1) %>%
join(.,frweightin1) %>%
join(.,frdeg1) %>%
join(.,frclose1) %>%
join(.,freigen1) -> frnetd1
head(frnetd1) %>% DT::datatable()
frnetd1$from <- c(1:nrow(frnetd1))
Below is the final constructed dataset for the nodes’ network positions that I’ll use with a model to figure out other atttributes.
final_add_construct <- merge(frnetd1, add_construct, by = "from", all = T)
head(final_add_construct) %>% DT::datatable()
new <- final_add_construct
hi <- new
# write.csv(final_add_construct, file = "final_add_construct.csv")
I’m using a variable for having learned about school conflict or not. In the data, there are 1425 no’s and 5055 yes’s. This is a really skewed ratio, which will likely create the same error that happens when companies try to predict if people will buy a product. The vast majority won’t, but we want to see a more normal ratio of no’s and yes’s. Let’s keep that in mind.
names(final_add_construct)
[1] "from" "outdeg" "PPID" "indeg" "weightout"
[6] "weightin" "degree" "closeness" "eigenvector" "ave.weight"
[11] "reciproc" "hom.gend" "hom.race" "hom.grade" "hom.weight"
[16] "hom.bff" "bffrecipbff" "sex" "race" "grade"
[21] "school"
w1 <- w1 %>% select(everything()) %>%
mutate(
indeg = IDGX2, # originally, indegree was coded as this: IDGX2
outdeg = ODGX2,
eigen = BCENT10X,
closeness = IGDMEAN,
hom.race = EHSRC5,
hom.grade = EHSGRD,
grade = H1GI20,
race = H1GI8,
sex = BIO_SEX
)
w1$bffrecipbff = ifelse(w1$BFFRECBF != 0 | w1$BFFRECBF != 0, 1, 0)
w1$hom.bff = ifelse(w1$BMFRECIP != 0 | w1$BFFRECIP != 0, 1, 0) # if your best friend reciprocates at all
w1$schoolconflict <- w1$H1TS14 # eventually I'll predict if my constructed df from the nominations has schoolconflict nodes or not based on network position bc happiness is associated with social support.
w1$schoolconflict[w1$schoolconflict > 1] <- 0 # need to recode the missing values
w1$schoolconflict <- factor(w1$schoolconflict, levels = 0:1, labels = c("No", "Yes"))
summary(as.factor(w1$schoolconflict)) # so, I'm recoding as 0 because I'm assuming (maybe irrationally so) that if they didn't select 1, then they didn't learn about conflict in school, plus, it's only 24 values and makes life simpler.)
No Yes
1449 5055
I’m beginning the partitioning & testing/training process below!
set.seed(212121)
data <- w1[ , c("schoolconflict", "indeg", "outdeg", "eigen",
"closeness", "hom.race", "hom.grade", "grade", "race", "sex")]
in_train <- caret::createDataPartition(data$schoolconflict, p = 3 / 4, list = FALSE)
training <- data[in_train, ]
testing <- data[-in_train, ]
Random Forrest
I’m using the following question because it relates to having social ties at school/network position, and network variables are all I have to work with.
Please tell me whether you have learned about each of the following things in a class at school: How to handle conflict
Random forest is terrible at predicting no’s for this outcome.
library(randomForest)
rf <- randomForest(schoolconflict ~ . , data = training, na.action = na.omit)
rf
Call:
randomForest(formula = schoolconflict ~ ., data = training, na.action = na.omit)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 3
OOB estimate of error rate: 21.95%
Confusion matrix:
No Yes class.error
No 6 583 0.98981324
Yes 24 2153 0.01102435
Bagged Tree is still terrible at predicting no’s, but gets a couple more correct. There’s a good chance that our outcome variable doesn’t make sense with our network predictors.
bag <- randomForest(schoolconflict ~ ., data = training, na.action = na.omit,
mtry = ncol(training) - 1)
rf
Call:
randomForest(formula = schoolconflict ~ ., data = training, na.action = na.omit)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 3
OOB estimate of error rate: 21.95%
Confusion matrix:
No Yes class.error
No 6 583 0.98981324
Yes 24 2153 0.01102435
Boosted Tree Yep — my predictors don’t have any bearing on my outcome variable.
library(gbm)
boosted <- gbm(schoolconflict == "yes" ~ ., data = training,
n.cores = parallel::detectCores())
Distribution not specified, assuming bernoulli ...
boosted
gbm(formula = schoolconflict == "yes" ~ ., data = training, n.cores = parallel::detectCores())
A gradient boosted model with bernoulli loss function.
100 iterations were performed.
There were 9 predictors of which 0 had non-zero influence.
Time for testing! We’ll redo them with other outcome variables afterwards. It looks like the random forest model got about 79% correct and bagged regression tree got about 78% correct, and the boosted regression tree is very wonkey (telling me it got 100% of them correct).
testing <- na.omit(testing)
mean(testing$schoolconflict == predict(rf, newdata = testing, type = "class"))
[1] 0.7854856
mean(testing$schoolconflict == predict(bag, newdata = testing, type = "class"))
[1] 0.7822839
mean((testing$schoolconflict == "yes") ==
(predict(boosted, newdata = testing, type = "response", n.trees = 100) > 0.5))
[1] 1
NewX <- testing
NewX$schoolconflict <- NULL
# mean(testing$schoolconflict == predict(bM, new_data = NewX, type = "class"))
Now, I’m going to use a variable that seems more relevant to social relationships. We’re going to predict loneliness based on network position. The distribution of lonely is: 4157 as “never lonely”, 1787 selected “sometimes lonely,” 401 selected “a lot of the time”, and 140 selected “all of the time.”
I’m beginning the partitioning & testing/training process below!
set.seed(212121)
w1$lonely <- w1$H1FS13
data <- w1[ , c("lonely", "indeg", "outdeg", "eigen",
"closeness", "hom.race", "hom.grade", "grade", "race", "sex")]
data$lonely <- ifelse(data$lonely > 0, 1, 0) # here's we're recoding everything that's not "never lonely" as "lonely" because it is a likert scale of never lonely to always lonely. I care if if anyone is EVER consistently lonely.
data$lonely <- factor(data$lonely, levels = 0:1, labels = c("No", "Yes"))
summary(data$lonely)
No Yes
4157 2347
names(data)
[1] "lonely" "indeg" "outdeg" "eigen" "closeness"
[6] "hom.race" "hom.grade" "grade" "race" "sex"
in_train2 <- caret::createDataPartition(data$lonely, p = 3 / 4, list = FALSE)
training2 <- data[in_train, ]
testing2 <- data[-in_train, ]
Random forest is interesting at predicting no’s for this outcome – better than with school conflict. It tends to predict no’s more than yes’s, which is the opposite issue than before!
rf <- randomForest(lonely ~ . , data = training2, na.action = na.omit)
rf
Call:
randomForest(formula = lonely ~ ., data = training2, na.action = na.omit)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 3
OOB estimate of error rate: 35.47%
Confusion matrix:
No Yes class.error
No 1578 237 0.1305785
Yes 744 207 0.7823344
Bagged Tree is pretty much the same as random forest!
bag <- randomForest(lonely ~ ., data = training2, na.action = na.omit, mtry = ncol(training) - 1)
bag
Call:
randomForest(formula = lonely ~ ., data = training2, mtry = ncol(training) - 1, na.action = na.omit)
Type of random forest: classification
Number of trees: 500
No. of variables tried at each split: 9
OOB estimate of error rate: 35.43%
Confusion matrix:
No Yes class.error
No 1538 277 0.1526171
Yes 703 248 0.7392219
Boosted Tree. It looks like my predictors had NO influence on loneliness at school. That seems fishy because the predictors are literally everything about social ties. Yep — my predictors don’t have any bearing on my outcome variable.
library(gbm)
boosted <- gbm(lonely == "yes" ~ ., data = training2,
n.cores = parallel::detectCores())
Distribution not specified, assuming bernoulli ...
boosted
gbm(formula = lonely == "yes" ~ ., data = training2, n.cores = parallel::detectCores())
A gradient boosted model with bernoulli loss function.
100 iterations were performed.
There were 9 predictors of which 0 had non-zero influence.
Time for testing! We’ll redo them with other outcome variables afterwards. It looks like the random forest model got about 79% correct and bagged regression tree got about 78% correct, and the boosted regression tree is very wonkey (telling me it got 100% of them correct). rf didn’t even get half of them right! Somehow, the bagged regression tree got 63% correct– a high. Boosted improvement. Boosted is just wildly 1 again.
testing2 <- na.omit(testing2)
mean(testing2$lonely == predict(rf, newdata = testing2, type = "class"))
[1] 0.638207
mean(testing2$lonely == predict(bag, newdata = testing2, type = "class"))
[1] 0.6339381
mean((testing2$lonely == "yes") ==
(predict(boosted, newdata = testing2, type = "response", n.trees = 100) > 0.5))
[1] 1
NewX <- testing2
NewX$lonely <- NULL
# mean(testing2$lonely == predict(bM, new_data = NewX, type = "class"))
Now, I’m going to try to see how lonely the kids from the UCI edgelists were when they took the interview. I’m only trusting the bagged regression because it looks reasonable. Now I have values for lonely!
new <- na.omit(new)
new$eigenvector <- new$eigen
new$eigen <- new$eigenvector
names(new)
[1] "from" "outdeg" "PPID" "indeg" "weightout"
[6] "weightin" "degree" "closeness" "eigenvector" "ave.weight"
[11] "reciproc" "hom.gend" "hom.race" "hom.grade" "hom.weight"
[16] "hom.bff" "bffrecipbff" "sex" "race" "grade"
[21] "school" "eigen"
new <- final_add_construct
# predict(rf, newdata = new, type = "class")
newdat <- new %>% rename(eigen = eigenvector)
predict(bag, newdata = newdat, type = "class")
1 2 3 <NA> <NA> 6 7 8 9 10 11 12 13 14 15
No No Yes <NA> <NA> Yes Yes No Yes Yes No Yes Yes Yes No
<NA> 17 18 19 20 21 22 <NA> 24 25 26 <NA> 28 29 30
<NA> Yes No Yes Yes Yes No <NA> Yes Yes No <NA> Yes Yes Yes
31 32 33 34 35 36 37 <NA> 39 40 <NA> 42 <NA> 44 45
Yes Yes Yes Yes No No Yes <NA> No No <NA> Yes <NA> No Yes
46 47 48 49 50 51 52 <NA> 54 55 56 57 58 59 60
Yes No No No No Yes Yes <NA> Yes Yes No No Yes Yes Yes
61 62 63 64 65 66 67 68 <NA> 70 71 72 73 74 75
Yes Yes Yes Yes Yes Yes No Yes <NA> No Yes Yes Yes No Yes
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
No Yes Yes Yes No Yes Yes Yes Yes No Yes No Yes Yes Yes
91 92 <NA> <NA> 95 96 97 <NA> 99 <NA> 101 102 103 104 105
Yes Yes <NA> <NA> No Yes Yes <NA> No <NA> Yes Yes No Yes Yes
<NA> <NA> <NA> <NA>
<NA> <NA> <NA> <NA>
Levels: No Yes
predict(boosted, newdata = newdat, type = "response", n.trees = 100 > 0.5)
[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[36] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[71] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[106] 0 0 0 0
Social Networks in Data Mining
If you’ve ever worked with social network data and would have killed to get your hands on some interesting attribute variable that just isn’t available with the dataset, then you’re in the same boat that I’m currently sailing. I have the first through fourth waves of the publicly available data from the National Longitudinal Study of Adolescent Health (ADD Health) and I’m most interested in adolescent social ties as peer networks. The problem is, I can’t seem to get access to the raw, participant-ID-linked nomination data, even after gaining access to the study’s restricted files. Inconveniently, I can’t use my restricted access for this project because my access is limited to use of the data within a secure server, or as Columbia calls it– the “Secure Data Enclave.” Despite these rather unavoidable obstacles, the work I’ll be doing for this project may give me a bit of insight into the network-driven questions I have. Here’s why:
While scouring the web for neat datasets, I stumbled upon the UC Irvine Network Database. I scrolled through the mini matrix dataset after matrix dataset, through zebra nominations, monkey social dominance interactions, dolphin networks, and networks of food-web data; while intriguing, these little undirected, binary network files weren’t part of my project-plan. But then, there it was: an 84-edgelist file set of the ADD Health raw nominations! If I learn nothing else from this project endeavor, I’ll at least have these files safe in my hard-drive for future exploration. However, to my chagrin, neither the nomination files, nor the attribute files, contained ANY linkage to the original ADD Health participant IDs (AIDs as they call them). Thus, I can’t conclusively link these thousands of friendship nominations to any of the nominators’ or nominees’ data from their separate multi-hour in-home questionnaires (Waves 1 through 4 of the official ADD Health public datasets available under contract here: http://www.cpc.unc.edu/projects/addhealth/contracts). But given this project isn’t guided by the academia-driven discouragement of exploratory analysis or any ethically-oriented analytical pre-registration, I intend to get creative.
My ADD Health Wave I-IV files from in-home structured interviews include TONS of variables: 5069 to be exact! (calculated below). They’re all about adolescent participants’ home lives, school lives, attitudes, affective & physical development, physiological tests, grades, friendship, and more. The ADD Health codebooks can be found here: http://www.cpc.unc.edu/projects/addhealth/documentation/ace So, I’m going to use my (un)supervised learning skills that I learned from my super-strategic data mining course to explore ways of potentially predicting various attributes of the un-identified participants in the raw-nominations data by including as many network-structural variable calculations as I can. Since the ADD Health Wave I-IV data has a large sample of pre-measured network characteristics like individual centrality, reciprocity, homophily, etc., I can calculate a bunch of these (and use the minimal attributes available alongside the raw-nomination edgelists) to concoct a function to predict other attributes, like participants’ sense of belonging at school, reports of participation in miscellaneous activities, and more. I’ll train my models on attribute/network variables available in my Wave I-IV datasets and predict who has what qualities based on their network position in the raw nomination data. I’ll have no way at this time to know if my models ACTUALLY end up predicting well on the true/intended data, but I’ll get a feel for their performance based upon the training and testing phase independent of
Reading in my ADD Health waves data
Below, I’m scraping raw network nomination data from UC Irvine’s network database. The are 84 files for 84 different communities. Each community has either one or two junior high schools and this nomination data comes from peer nominations within these schools. Students were permitted to nominate names of their friends from a roster of the names of the other students at their school, as well as their community’s “sister” school–if their respective community had a sister school–otherwise, students were instructed to nominate only friends from their own school.
The variables are all coded pretty abstractly, so I have to recode them to have some meaning.
Clearly the datasets have rather abstract names. We’ll deal with that soon.
The variables are all coded pretty abstractly, so I have to recode them to have some meaning.
Gender is coded as 1 = male; 2 = female. Race is coded as 1 = white, 2 = black, 3 = latinx, 4 = asian, 5 = mixed/other
This specific matrix seems like it’s coded differently than how it’s supposed to. I’m going to skip to edgelist number two.