This document describes several exploratory steps in analyzing social network data for circle discovery. The problem is posted as a Kaggle competition on learning social circles in networks. The original “features.txt” data file has been transformed into a CSV file called “features.csv” using a Java program - MakeCSVFromFeatures.java. The GitHub repository containing all the source code is located here.
[1] J. McAuley, J. Leskovec. Learning to discover social circles in ego networks. Neural Information Processing Systems. 2012.
Read in all the egos along with their profile features
# Load the profile features of all users; the str() listing shows that every
# column is read as an integer code.
egos <- read.csv(file = "features.csv")
str(object = egos)
## 'data.frame': 27520 obs. of 58 variables:
## $ id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ birthday : int 0 1 2 3 4 NA 5 6 7 NA ...
## $ education.classes.description: int NA NA NA NA NA NA NA NA NA NA ...
## $ education.classes.from.id : int NA 0 NA NA NA NA NA NA NA NA ...
## $ education.classes.from.name : int NA 0 NA NA NA NA NA NA NA NA ...
## $ education.classes.id : int NA 0 2 NA NA NA NA NA NA NA ...
## $ education.classes.name : int NA 0 2 NA NA NA NA NA NA NA ...
## $ education.classes.with.id : int NA 0 NA NA NA NA NA NA NA NA ...
## $ education.classes.with.name : int NA 0 NA NA NA NA NA NA NA NA ...
## $ education.concentration.id : int 0 2 NA 3 4 NA NA 5 6 NA ...
## $ education.concentration.name : int 0 2 NA 3 4 NA NA 5 6 NA ...
## $ education.degree.id : int NA NA NA NA NA NA NA NA NA NA ...
## $ education.degree.name : int NA NA NA NA NA NA NA NA NA NA ...
## $ education.school.id : int 1 3 4 1 1 NA 3 1 1 NA ...
## $ education.school.name : int 1 3 4 1 1 NA 3 1 1 NA ...
## $ education.type : int 1 1 1 1 1 NA 1 1 1 NA ...
## $ education.with.id : int NA NA NA NA NA NA NA NA NA NA ...
## $ education.with.name : int NA NA NA NA NA NA NA NA NA NA ...
## $ education.year.id : int 1 3 1 1 1 NA 0 1 1 NA ...
## $ education.year.name : int 1 3 1 1 1 NA 0 1 1 NA ...
## $ first_name : int 0 1 2 3 4 5 6 7 8 9 ...
## $ gender : int 0 1 1 1 0 0 1 1 1 1 ...
## $ hometown.id : int 0 1 2 3 4 NA 5 NA 2 NA ...
## $ hometown.name : int 0 1 2 3 4 NA 5 NA 2 NA ...
## $ id.1 : int 0 1 2 3 4 5 6 7 8 9 ...
## $ languages.id : int NA NA NA 1 2 NA NA 2 NA NA ...
## $ languages.name : int NA NA NA 1 2 NA NA 2 NA NA ...
## $ last_name : int 0 1 2 3 4 5 6 7 8 9 ...
## $ locale : int 0 0 0 0 0 0 0 0 0 0 ...
## $ location : int NA NA NA NA NA NA NA NA NA NA ...
## $ location.id : int 0 1 2 3 4 NA 5 0 0 NA ...
## $ location.name : int 0 1 2 3 4 NA 5 0 0 NA ...
## $ middle_name : int NA NA NA NA 0 NA NA NA NA NA ...
## $ name : int 0 1 2 3 4 5 6 7 8 9 ...
## $ political : int NA NA NA NA NA NA NA NA NA NA ...
## $ religion : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.description : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.employer.id : int NA NA 1 2 3 NA NA 8 9 NA ...
## $ work.employer.name : int NA NA 1 2 3 NA NA 8 9 NA ...
## $ work.end_date : int NA NA 1 NA NA NA NA 3 4 NA ...
## $ work.from.id : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.from.name : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.location.id : int NA NA 0 NA NA NA NA 1 1 NA ...
## $ work.location.name : int NA NA 0 NA NA NA NA 1 1 NA ...
## $ work.position.id : int NA NA 1 NA 2 NA NA 3 6 NA ...
## $ work.position.name : int NA NA 1 NA 2 NA NA 3 0 NA ...
## $ work.projects.description : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.projects.end_date : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.projects.from.id : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.projects.from.name : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.projects.id : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.projects.name : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.projects.start_date : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.projects.with.id : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.projects.with.name : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.start_date : int NA NA 1 2 3 NA NA 6 4 NA ...
## $ work.with.id : int NA NA NA NA NA NA NA NA NA NA ...
## $ work.with.name : int NA NA NA NA NA NA NA NA NA NA ...
Read in sample_submission.csv and extract the list of egos for prediction
# The first column of the sample submission supplies the ego ids to predict for.
sample_sub <- read.csv(file = "sample_submission.csv")
sample_egos <- sample_sub[, 1]
str(sample_egos)
## int [1:50] 25708 2473 18844 19268 25283 21869 17748 5744 3656 17002 ...
In this clustering step, we use only the profile features to cluster friends into circles.
First, remove some columns that are not useful for clustering, such as birthday and first name.
# Drop the birthday and first_name columns; every other column is kept.
dropped_columns <- c("birthday", "first_name")
egos <- egos[, !(names(egos) %in% dropped_columns), drop = FALSE]
Extract an ego’s friends’ profile features
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Pick the second ego of the sample submission and read the ids of its friends
# from the matching .egonet file.
ego <- sample_egos[2]
ego_friends_file <- file.path("egonets", paste0(ego, ".egonet"))
lines <- readLines(ego_friends_file)
# Skip blank lines so that a stray empty line cannot break the parsing.
lines <- lines[nzchar(lines)]
# The text before the first ":" on each line is taken as a friend id; sub()
# strips the remainder of every line in one vectorised call.
ego_friends <- as.integer(sub(":.*", "", lines))
str(ego_friends)
## int [1:156] 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 ...
# Keep the profile rows whose id is one of the ego's friends.
is_friend <- egos$id %in% ego_friends
ego_friends_features <- egos[is_friend, ]
I want to view several rows of the data in a nice format.
# Wrap the friends' features in a dplyr table so that printing stays compact.
# NOTE(review): tbl_df() is deprecated in recent dplyr releases in favour of
# as_tibble() - confirm against the installed dplyr version before switching.
tbl <- dplyr::tbl_df(ego_friends_features)
head(tbl, n = 6L)
## Source: local data frame [6 x 56]
##
## id education.classes.description education.classes.from.id
## 2475 2474 NA NA
## 2476 2475 NA NA
## 2477 2476 NA NA
## 2478 2477 NA NA
## 2479 2478 NA NA
## 2480 2479 NA NA
## Variables not shown: education.classes.from.name (int),
## education.classes.id (int), education.classes.name (int),
## education.classes.with.id (int), education.classes.with.name (int),
## education.concentration.id (int), education.concentration.name (int),
## education.degree.id (int), education.degree.name (int),
## education.school.id (int), education.school.name (int), education.type
## (int), education.with.id (int), education.with.name (int),
## education.year.id (int), education.year.name (int), gender (int),
## hometown.id (int), hometown.name (int), id.1 (int), languages.id (int),
## languages.name (int), last_name (int), locale (int), location (int),
## location.id (int), location.name (int), middle_name (int), name (int),
## political (int), religion (int), work.description (int),
## work.employer.id (int), work.employer.name (int), work.end_date (int),
## work.from.id (int), work.from.name (int), work.location.id (int),
## work.location.name (int), work.position.id (int), work.position.name
## (int), work.projects.description (int), work.projects.end_date (int),
## work.projects.from.id (int), work.projects.from.name (int),
## work.projects.id (int), work.projects.name (int),
## work.projects.start_date (int), work.projects.with.id (int),
## work.projects.with.name (int), work.start_date (int), work.with.id
## (int), work.with.name (int)
There are lots of missing values. Summarize the data to visualize where the missing values are.
# Per-column summary; the "NA's" row reports how many values are missing in
# each column.
summary(tbl)
## id education.classes.description education.classes.from.id
## Min. :2474 Min. : NA Min. : 99
## 1st Qu.:2513 1st Qu.: NA 1st Qu.:100
## Median :2552 Median : NA Median :100
## Mean :2552 Mean :NaN Mean :101
## 3rd Qu.:2590 3rd Qu.: NA 3rd Qu.:102
## Max. :2629 Max. : NA Max. :103
## NA's :156 NA's :150
## education.classes.from.name education.classes.id education.classes.name
## Min. : 99 Min. : 58 Min. : 20
## 1st Qu.:100 1st Qu.:318 1st Qu.:114
## Median :100 Median :322 Median :310
## Mean :101 Mean :278 Mean :229
## 3rd Qu.:102 3rd Qu.:324 3rd Qu.:313
## Max. :103 Max. :328 Max. :317
## NA's :150 NA's :142 NA's :142
## education.classes.with.id education.classes.with.name
## Min. :299 Min. :299
## 1st Qu.:304 1st Qu.:304
## Median :305 Median :305
## Mean :306 Mean :306
## 3rd Qu.:307 3rd Qu.:307
## Max. :314 Max. :314
## NA's :147 NA's :147
## education.concentration.id education.concentration.name
## Min. : 74 Min. : 25
## 1st Qu.:586 1st Qu.: 74
## Median :587 Median :545
## Mean :566 Mean :342
## 3rd Qu.:598 3rd Qu.:550
## Max. :610 Max. :561
## NA's :117 NA's :117
## education.degree.id education.degree.name education.school.id
## Min. :104 Min. :100 Min. :2409
## 1st Qu.:104 1st Qu.:100 1st Qu.:2410
## Median :104 Median :100 Median :2415
## Mean :104 Mean :100 Mean :2442
## 3rd Qu.:104 3rd Qu.:100 3rd Qu.:2464
## Max. :104 Max. :100 Max. :2563
## NA's :155 NA's :155 NA's :25
## education.school.name education.type education.with.id
## Min. :2296 Min. :0.00 Min. :531
## 1st Qu.:2296 1st Qu.:1.00 1st Qu.:562
## Median :2297 Median :1.00 Median :566
## Mean :2325 Mean :1.05 Mean :570
## 3rd Qu.:2344 3rd Qu.:1.00 3rd Qu.:590
## Max. :2445 Max. :2.00 Max. :604
## NA's :25 NA's :25 NA's :138
## education.with.name education.year.id education.year.name gender
## Min. :530 Min. : 0.0 Min. : 0.00 Min. :0.000
## 1st Qu.:561 1st Qu.: 2.0 1st Qu.: 2.00 1st Qu.:0.000
## Median :566 Median : 8.0 Median : 7.00 Median :0.000
## Mean :569 Mean :10.6 Mean : 8.71 Mean :0.417
## 3rd Qu.:589 3rd Qu.:17.0 3rd Qu.:15.00 3rd Qu.:1.000
## Max. :603 Max. :59.0 Max. :51.00 Max. :1.000
## NA's :138 NA's :61 NA's :61
## hometown.id hometown.name id.1 languages.id languages.name
## Min. : 12 Min. : 12 Min. :2474 Min. : 0 Min. : 0.0
## 1st Qu.:534 1st Qu.:531 1st Qu.:2513 1st Qu.: 40 1st Qu.: 14.0
## Median :547 Median :544 Median :2552 Median :122 Median : 35.0
## Mean :501 Mean :498 Mean :2552 Mean :133 Mean : 56.7
## 3rd Qu.:562 3rd Qu.:558 3rd Qu.:2590 3rd Qu.:231 3rd Qu.:103.0
## Max. :588 Max. :585 Max. :2629 Max. :239 Max. :185.0
## NA's :41 NA's :41 NA's :108 NA's :108
## last_name locale location location.id location.name
## Min. : 580 Min. : 0.0 Min. : NA Min. : 1 Min. : 1
## 1st Qu.:2020 1st Qu.:22.0 1st Qu.: NA 1st Qu.:418 1st Qu.:416
## Median :2054 Median :22.0 Median : NA Median :418 Median :416
## Mean :2013 Mean :19.6 Mean :NaN Mean :419 Mean :417
## 3rd Qu.:2088 3rd Qu.:22.0 3rd Qu.: NA 3rd Qu.:423 3rd Qu.:421
## Max. :2126 Max. :22.0 Max. : NA Max. :447 Max. :445
## NA's :156 NA's :26 NA's :26
## middle_name name political religion work.description
## Min. : 12 Min. :2468 Min. :6 Min. :12 Min. :249
## 1st Qu.:182 1st Qu.:2506 1st Qu.:6 1st Qu.:12 1st Qu.:252
## Median :188 Median :2544 Median :6 Median :12 Median :257
## Mean :175 Mean :2545 Mean :6 Mean :12 Mean :258
## 3rd Qu.:194 3rd Qu.:2583 3rd Qu.:6 3rd Qu.:12 3rd Qu.:264
## Max. :199 Max. :2622 Max. :6 Max. :12 Max. :268
## NA's :133 NA's :155 NA's :155 NA's :141
## work.employer.id work.employer.name work.end_date work.from.id
## Min. :1023 Min. :1008 Min. : 4 Min. :43.0
## 1st Qu.:1649 1st Qu.:1627 1st Qu.: 5 1st Qu.:43.5
## Median :1679 Median :1657 Median : 5 Median :44.0
## Mean :1676 Mean :1654 Mean : 29 Mean :44.0
## 3rd Qu.:1720 3rd Qu.:1698 3rd Qu.: 62 3rd Qu.:44.5
## Max. :1748 Max. :1726 Max. :119 Max. :45.0
## NA's :83 NA's :83 NA's :125 NA's :153
## work.from.name work.location.id work.location.name work.position.id
## Min. :43.0 Min. :270 Min. :266 Min. :162
## 1st Qu.:43.5 1st Qu.:274 1st Qu.:270 1st Qu.:789
## Median :44.0 Median :276 Median :272 Median :803
## Mean :44.0 Mean :278 Mean :273 Mean :784
## 3rd Qu.:44.5 3rd Qu.:282 3rd Qu.:277 3rd Qu.:826
## Max. :45.0 Max. :292 Max. :286 Max. :839
## NA's :153 NA's :118 NA's :118 NA's :111
## work.position.name work.projects.description work.projects.end_date
## Min. : 84 Min. :66 Min. : NA
## 1st Qu.:758 1st Qu.:66 1st Qu.: NA
## Median :772 Median :66 Median : NA
## Mean :685 Mean :66 Mean :NaN
## 3rd Qu.:794 3rd Qu.:66 3rd Qu.: NA
## Max. :806 Max. :66 Max. : NA
## NA's :111 NA's :155 NA's :156
## work.projects.from.id work.projects.from.name work.projects.id
## Min. :34.0 Min. :34.0 Min. :171
## 1st Qu.:34.5 1st Qu.:34.5 1st Qu.:172
## Median :35.0 Median :35.0 Median :173
## Mean :35.0 Mean :35.0 Mean :173
## 3rd Qu.:35.5 3rd Qu.:35.5 3rd Qu.:175
## Max. :36.0 Max. :36.0 Max. :176
## NA's :153 NA's :153 NA's :151
## work.projects.name work.projects.start_date work.projects.with.id
## Min. :171 Min. : NA Min. : 98.0
## 1st Qu.:172 1st Qu.: NA 1st Qu.: 98.8
## Median :173 Median : NA Median :101.0
## Mean :173 Mean :NaN Mean :101.2
## 3rd Qu.:175 3rd Qu.: NA 3rd Qu.:103.5
## Max. :176 Max. : NA Max. :105.0
## NA's :151 NA's :156 NA's :152
## work.projects.with.name work.start_date work.with.id work.with.name
## Min. : 98.0 Min. : 2.0 Min. :181 Min. :181
## 1st Qu.: 98.8 1st Qu.: 2.0 1st Qu.:181 1st Qu.:181
## Median :101.0 Median : 2.0 Median :181 Median :181
## Mean :101.2 Mean : 19.1 Mean :181 Mean :181
## 3rd Qu.:103.5 3rd Qu.: 26.0 3rd Qu.:181 3rd Qu.:181
## Max. :105.0 Max. :127.0 Max. :181 Max. :181
## NA's :152 NA's :89 NA's :155 NA's :155
Cluster the ego’s friends and plot the dendrogram
# Standardise every column, compute pairwise Euclidean distances between the
# friends and build a complete-linkage hierarchical clustering.
# NOTE(review): all columns of ego_friends_features take part in the distance,
# including the id column - confirm that this is intended.
scaled_features <- scale(ego_friends_features)
dist_friends <- dist(scaled_features, method = "euclidean")
hclustering <- hclust(dist_friends, method = "complete")
# Draw the dendrogram without axis titles.
plot(hclustering, xlab = "", ylab = "")
Cut the hierarchical clustering into groups. Experiment with k = 5 for the cut:
# Cut the dendrogram into five groups; `circles` holds one group label per
# clustered friend.
circles <- cutree(tree = hclustering, k = 5)
Add the circle indicators to the friends’ features.
# Append the group label of each friend as an extra `circles` column.
ego_friends_features_circles <- data.frame(
  ego_friends_features,
  circles = circles,
  check.names = FALSE
)
Write the submission results in the required format.
# Build the one-row submission (UserId, Predicted) for the current ego.
# Friends inside a circle are separated by " " and circles by ";".
# The circles are derived from the `circles` column itself instead of a
# hard-coded 1:5, so the result stays correct when the number of groups passed
# to cutree() is changed.
circles_list <- split(
  ego_friends_features_circles$id,
  ego_friends_features_circles$circles
)
circles_str <- vapply(circles_list, paste, character(1), collapse = " ")
submission_result <- paste(circles_str, collapse = ";")
df <- data.frame(UserId = ego, Predicted = submission_result)
# Write the result to a file; create the output directory when it is missing
# so that write.csv() does not fail on a fresh checkout.
dir.create("submission", showWarnings = FALSE)
write.csv(df, file = "submission/hcluster-expl.csv", quote = FALSE, row.names = FALSE)
Find the circles for the sample egos in the sample_submission.csv file:
# Cluster the friends of every ego listed in the sample submission into at most
# `num_circle` circles and write one submission row per ego.
num_circle <- 3
# Rows are collected in a preallocated list and bound once after the loop
# instead of growing `df` with rbind() on every iteration.
rows <- vector("list", length(sample_egos))
for (j in seq_along(sample_egos)) {
  ego <- sample_egos[j]
  ego_friends_file <- file.path("egonets", paste0(ego, ".egonet"))
  lines <- readLines(ego_friends_file)
  # Skip blank lines; the text before the first ":" is taken as a friend id.
  lines <- lines[nzchar(lines)]
  ego_friends <- as.integer(sub(":.*", "", lines))
  ego_friends_features <- egos[egos$id %in% ego_friends, , drop = FALSE]
  n_friends <- nrow(ego_friends_features)
  if (n_friends < 2) {
    # hclust() needs at least two observations: put the 0 or 1 friends found
    # into a single circle instead of failing.
    circles <- rep(1L, n_friends)
  } else {
    # NOTE(review): every column, including id, enters the distance.
    dist_friends <- dist(scale(ego_friends_features))
    hclustering <- hclust(dist_friends)
    # cutree() rejects a k larger than the number of observations.
    circles <- cutree(hclustering, k = min(num_circle, n_friends))
  }
  # Friends inside a circle are separated by " ", circles by ";".
  circles_list <- split(ego_friends_features$id, circles)
  circles_str <- vapply(circles_list, paste, character(1), collapse = " ")
  submission_result <- paste(circles_str, collapse = ";")
  rows[[j]] <- data.frame(UserId = ego, Predicted = submission_result)
}
# The empty frame keeps the expected header even when there are no egos.
df <- rbind(
  data.frame(UserId = numeric(), Predicted = character()),
  do.call(rbind, rows)
)
# Write the results to a file; create the output directory when it is missing.
dir.create("submission", showWarnings = FALSE)
write.csv(df, file = "submission/hcluster-submission-subcols.csv", quote = FALSE, row.names = FALSE)
Find the circles for all the egos in the test set:
# Read the ego ids of the test set: the text before the first ":" on each
# non-blank line of testSet_users_friends.csv is taken as an ego id.
testlines <- readLines("testSet_users_friends.csv")
testlines <- testlines[nzchar(testlines)]
testset <- as.integer(sub(":.*", "", testlines))
# Cluster the friends of every test ego into at most `num_circle` circles and
# write one submission row per ego.
num_circle <- 3
# Rows are collected in a preallocated list and bound once after the loop
# instead of growing `df` with rbind() on every iteration.
rows <- vector("list", length(testset))
for (j in seq_along(testset)) {
  ego <- testset[j]
  ego_friends_file <- file.path("egonets", paste0(ego, ".egonet"))
  lines <- readLines(ego_friends_file)
  # Skip blank lines; the text before the first ":" is taken as a friend id.
  lines <- lines[nzchar(lines)]
  ego_friends <- as.integer(sub(":.*", "", lines))
  ego_friends_features <- egos[egos$id %in% ego_friends, , drop = FALSE]
  n_friends <- nrow(ego_friends_features)
  if (n_friends < 2) {
    # hclust() needs at least two observations: put the 0 or 1 friends found
    # into a single circle instead of failing.
    circles <- rep(1L, n_friends)
  } else {
    # NOTE(review): every column, including id, enters the distance.
    dist_friends <- dist(scale(ego_friends_features))
    hclustering <- hclust(dist_friends)
    # cutree() rejects a k larger than the number of observations.
    circles <- cutree(hclustering, k = min(num_circle, n_friends))
  }
  # Friends inside a circle are separated by " ", circles by ";".
  circles_list <- split(ego_friends_features$id, circles)
  circles_str <- vapply(circles_list, paste, character(1), collapse = " ")
  submission_result <- paste(circles_str, collapse = ";")
  rows[[j]] <- data.frame(UserId = ego, Predicted = submission_result)
}
# The empty frame keeps the expected header even when there are no egos.
df <- rbind(
  data.frame(UserId = numeric(), Predicted = character()),
  do.call(rbind, rows)
)
# Write the results to a file; create the output directory when it is missing.
dir.create("submission", showWarnings = FALSE)
write.csv(df, file = "submission/hcluster-test-submission-subcols.csv", quote = FALSE, row.names = FALSE)
Analyse a randomly chosen file containing some circles, and pick one circle from it.
library(stringr)

# Read the circles of one training ego and take the first line.
circles <- readLines("Training/12800.circles")
# The member list is the field after the first ":"; trim the surrounding
# whitespace before splitting it on single spaces.
first_circle_parts <- strsplit(circles[1], ":")[[1]]
acircle <- str_trim(first_circle_parts[2])
acircle_ids <- as.integer(unlist(strsplit(acircle, " ")))
Get all the profiles of the friends in the circle.
# Keep only the profile rows whose id belongs to the chosen circle.
in_circle <- egos$id %in% acircle_ids
circle_friends <- egos[in_circle, ]
#summary(circle_friends)
Remove the features that have too many missing values.
# Keep the features that have fewer than 30 missing values among the circle's
# members. drop = FALSE keeps the result a data frame even when only a single
# column qualifies, so later column-wise code still works.
na_counts <- colSums(is.na(circle_friends))
circle_friends_nonafeatures <- circle_friends[, na_counts < 30, drop = FALSE]
#summary(circle_friends_nonafeatures)
For the remaining features, check which features have many values in common:
# Print the name and the value counts of every remaining feature, followed by
# blank lines as a separator. Iterating over the column names (instead of
# 1:ncol(), which yields c(1, 0) for a frame without columns) makes the loop a
# no-op when no feature is left.
for (feature in names(circle_friends_nonafeatures)) {
  print(feature)
  print(table(circle_friends_nonafeatures[[feature]]))
  writeLines("\n\n")
}
## [1] "id"
##
## 12827 12851 12859 12869 12891 12897 12920 12923 12942 12944 12961 12972
## 1 1 1 1 1 1 1 1 1 1 1 1
## 12977 12993 13003 13042 13048 13053 13064 13084 13092 13104 13113 13120
## 1 1 1 1 1 1 1 1 1 1 1 1
## 13121 13178 13226 13234 13236 13262 13264 13274 13276 13318 13344
## 1 1 1 1 1 1 1 1 1 1 1
##
##
##
## [1] "education.concentration.id"
##
## 17 18 29 44 111 129 172 233 235 265 279 652 2979 2981 3001
## 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
##
##
##
## [1] "education.concentration.name"
##
## 17 18 29 44 105 128 169 210 227 255 268 601 2844 2846 2866
## 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
##
##
##
## [1] "education.school.id"
##
## 570 1026 2382 2610 3244 3291 3386 4230 6357 6897 11749 12040
## 1 1 1 1 1 1 1 1 1 1 8 1
## 12049 12079 12100 12106 12129 12135 12151 12152 12212
## 1 1 1 1 1 1 1 1 1
##
##
##
## [1] "education.school.name"
##
## 554 987 1270 2269 2491 3084 3131 3225 4038 6043 6331 6547
## 1 1 1 1 1 1 1 1 1 1 1 1
## 11130 11399 11408 11454 11477 11495 11496 11549
## 9 1 1 1 1 1 1 1
##
##
##
## [1] "education.type"
##
## 0 1 2
## 1 19 8
##
##
##
## [1] "education.year.id"
##
## 0 1 4 5 6 7 8 9 29
## 2 2 1 3 2 2 3 2 2
##
##
##
## [1] "education.year.name"
##
## 0 1 2 4 5 6 7 8 25
## 2 2 2 1 3 2 3 2 2
##
##
##
## [1] "gender"
##
## 0
## 35
##
##
##
## [1] "hometown.id"
##
## 98 604 762 1358 2677 2692
## 1 1 18 1 1 1
##
##
##
## [1] "hometown.name"
##
## 98 601 757 1352 2667 2682
## 1 1 18 1 1 1
##
##
##
## [1] "id.1"
##
## 12827 12851 12859 12869 12891 12897 12920 12923 12942 12944 12961 12972
## 1 1 1 1 1 1 1 1 1 1 1 1
## 12977 12993 13003 13042 13048 13053 13064 13084 13092 13104 13113 13120
## 1 1 1 1 1 1 1 1 1 1 1 1
## 13121 13178 13226 13234 13236 13262 13264 13274 13276 13318 13344
## 1 1 1 1 1 1 1 1 1 1 1
##
##
##
## [1] "last_name"
##
## 666 681 1254 1257 1325 1978 2874 3926 6138 8687 8885 8900 8903 8923 8949
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2
## 8963 8982 8988 9012 9015 9024 9034 9036 9038 9048 9054 9089 9122 9135 9136
## 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## 9141 9172 9184
## 1 1 1
##
##
##
## [1] "locale"
##
## 0 1 3
## 2 32 1
##
##
##
## [1] "location.id"
##
## 58 175 247 416 544 578 595 649 822 1315 1411 1832 1838 1841 1852
## 1 1 1 6 3 1 1 1 1 1 1 1 1 1 1
##
##
##
## [1] "location.name"
##
## 58 175 247 414 538 572 589 643 815 1308 1404 1824 1830 1833 1844
## 1 1 1 6 3 1 1 1 1 1 1 1 1 1 1
##
##
##
## [1] "name"
##
## 12754 12778 12786 12796 12818 12824 12847 12850 12869 12871 12888 12897
## 1 1 1 1 1 1 1 1 1 1 1 1
## 12902 12918 12928 12967 12973 12978 12989 13009 13017 13029 13038 13045
## 1 1 1 1 1 1 1 1 1 1 1 1
## 13046 13103 13151 13159 13161 13187 13189 13199 13201 13243 13269
## 1 1 1 1 1 1 1 1 1 1 1
##
##
##
## [1] "work.employer.id"
##
## 865 8936 8961 8971 8988 9001 9028 9034 9043 9058 9089 9093 9158 9194 9209
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 9245
## 1
##
##
##
## [1] "work.employer.name"
##
## 854 8659 8682 8692 8708 8721 8748 8754 8763 8776 8806 8810 8871 8907 8922
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 8957
## 1
##
##
##
## [1] "work.end_date"
##
## 5
## 6
##
##
##
## [1] "work.position.id"
##
## 715 1616 2019 2757 4209 4216 4218 4316
## 1 1 1 1 1 1 1 1
##
##
##
## [1] "work.position.name"
##
## 693 1550 1943 2665 4078 4085 4087 4180
## 1 1 1 1 1 1 1 1
##
##
##
## [1] "work.start_date"
##
## 2 3 116
## 8 1 1
Select the seemingly useful features and re-cluster.
# Restrict the profiles to the hand-picked features and cluster the friends of
# every sample ego again, this time into at most five circles.
useful_features <- c(
  "id", "education.school.id", "education.school.name", "education.type",
  "education.year.id", "education.year.name", "gender", "hometown.id",
  "hometown.name", "locale", "location.id", "location.name", "work.end_date",
  "work.employer.name", "work.start_date"
)
egos <- egos[, useful_features]
num_circle <- 5
# Rows are collected in a preallocated list and bound once after the loop
# instead of growing `df` with rbind() on every iteration.
rows <- vector("list", length(sample_egos))
for (j in seq_along(sample_egos)) {
  ego <- sample_egos[j]
  ego_friends_file <- file.path("egonets", paste0(ego, ".egonet"))
  lines <- readLines(ego_friends_file)
  # Skip blank lines; the text before the first ":" is taken as a friend id.
  lines <- lines[nzchar(lines)]
  ego_friends <- as.integer(sub(":.*", "", lines))
  ego_friends_features <- egos[egos$id %in% ego_friends, , drop = FALSE]
  n_friends <- nrow(ego_friends_features)
  if (n_friends < 2) {
    # hclust() needs at least two observations: put the 0 or 1 friends found
    # into a single circle instead of failing.
    circles <- rep(1L, n_friends)
  } else {
    # NOTE(review): the id column still enters the distance; an earlier
    # experiment that removed it first was left disabled:
    #ego_friends_features <- subset(ego_friends_features, select = -c(id))
    dist_friends <- dist(scale(ego_friends_features))
    hclustering <- hclust(dist_friends)
    # cutree() rejects a k larger than the number of observations.
    circles <- cutree(hclustering, k = min(num_circle, n_friends))
  }
  # Friends inside a circle are separated by " ", circles by ";".
  circles_list <- split(ego_friends_features$id, circles)
  circles_str <- vapply(circles_list, paste, character(1), collapse = " ")
  submission_result <- paste(circles_str, collapse = ";")
  rows[[j]] <- data.frame(UserId = ego, Predicted = submission_result)
}
# The empty frame keeps the expected header even when there are no egos.
df <- rbind(
  data.frame(UserId = numeric(), Predicted = character()),
  do.call(rbind, rows)
)
# Write the results to a file; create the output directory when it is missing.
dir.create("submission", showWarnings = FALSE)
write.csv(df, file = "submission/hcluster-submission-usefulfeatures-6.csv", quote = FALSE, row.names = FALSE)