projects<-get(load("projects.RData"))
Scratch is a project from the MIT Media Lab that lets young people create online games, animations, and interactive stories with an easy-to-use, block-based coding platform.
The MIT Scratch dataset covers about 2 million users and several data tables. We explore a few of them here: the main projects table, the users table, and the featured projects table.
str(projects)
## 'data.frame': 1928699 obs. of 43 variables:
## $ project.id : int 2437817 2437816 2437815 2437814 2437813 2437812 2437811 2437810 2437809 2437808 ...
## $ user.id : int 1097258 1225957 859392 1286497 1056442 1225622 1291085 1184193 585404 982894 ...
## $ date.created : POSIXct, format: "2012-03-31 21:30:01" "2012-03-31 21:28:59" ...
## $ viewers.website : int 0 0 0 1 0 0 0 0 1 6 ...
## $ lovers.website : int NA NA NA NA NA NA NA NA NA NA ...
## $ downloaders.website : int 0 0 0 0 0 0 0 0 1 0 ...
## $ sprites.website : int 2 1 NA 1 7 2 2 7 1 NA ...
## $ scripts.website : int 1 1 2 1 21 21 2 18 1 1 ...
## $ parent.project.id : int 2437776 NA NA NA NA NA NA 2422058 NA NA ...
## $ parent.user.id : int 1097258 NA NA NA NA NA NA 1072582 NA NA ...
## $ locked : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ is.remix : logi TRUE FALSE FALSE FALSE FALSE FALSE ...
## $ is.self.remix : logi TRUE FALSE FALSE FALSE FALSE FALSE ...
## $ is.remixed : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ blocks : int 2 5 6 30 75 79 27 78 6 2 ...
## $ block.types : int 2 5 6 10 23 19 14 15 2 2 ...
## $ images : int 3 2 2 5 10 4 5 10 3 1 ...
## $ sounds : int 3 1 1 2 1 1 2 1 6 1 ...
## $ ugstrings : int NA 2 1 11 7 NA 1 19 NA NA ...
## $ saves : int 4 NA NA 1 NA 3 1 NA 1 NA ...
## $ seconds.to.share : int 6 NA NA 199 NA 54957 21 NA 517 NA ...
## $ seconds.to.remix : int 0 0 0 0 0 0 0 0 0 0 ...
## $ is.remix.identical : logi TRUE NA NA NA NA NA ...
## $ is.remix.identical.media : logi TRUE NA NA NA NA NA ...
## $ is.remix.identical.code.and.strings: logi TRUE NA NA NA NA NA ...
## $ is.remix.identical.code : logi TRUE NA NA NA NA NA ...
## $ remix.added.images : int 0 NA NA NA NA NA NA 0 NA NA ...
## $ remix.deleted.images : int 0 NA NA NA NA NA NA 0 NA NA ...
## $ remix.changed.images : int 0 NA NA NA NA NA NA 7 NA NA ...
## $ remix.added.sounds : int 0 NA NA NA NA NA NA 0 NA NA ...
## $ remix.deleted.sounds : int 0 NA NA NA NA NA NA 0 NA NA ...
## $ remix.changed.sounds : int 1 NA NA NA NA NA NA 0 NA NA ...
## $ remix.editdistance.code.and.strings: int 0 NA NA NA NA NA NA 0 NA NA ...
## $ remix.editdistance.code : int 0 NA NA NA NA NA NA 0 NA NA ...
## $ remix.editdistance.media : int 1 NA NA NA NA NA NA 7 NA NA ...
## $ remix.editdistance.images : int 0 NA NA NA NA NA NA 7 NA NA ...
## $ remix.editdistance.sounds : int 1 NA NA NA NA NA NA 0 NA NA ...
## $ remix.derivativeness : int 89 NA NA NA NA NA NA 97 NA NA ...
## $ remix.derivativeness.images : int 100 NA NA NA NA NA NA 76 NA NA ...
## $ remix.derivativeness.sounds : int 75 NA NA NA NA NA NA 100 NA NA ...
## $ remix.derivativeness.blocks : int 100 NA NA NA NA NA NA 100 NA NA ...
## $ remix.derivativeness.text : int 0 NA NA NA NA NA NA 100 NA NA ...
## $ remix.derivativeness.parameters : int 0 NA NA NA NA NA NA 100 NA NA ...
We need to transform the dataset into a format suitable for further analysis. This requires some data wrangling: cleaning, transformation, recoding, and formatting.
Logical values are recoded from TRUE/FALSE to 1/0, and NAs are recoded to 0 to indicate no activity.
#Recode missing values
#Replace NA with 0 in selected count variables
projects$blocks[is.na(projects$blocks)] <- 0
projects$block.types[is.na(projects$block.types)] <- 0
projects$lovers.website[is.na(projects$lovers.website)] <- 0
projects$sprites.website[is.na(projects$sprites.website)] <- 0
projects$scripts.website[is.na(projects$scripts.website)] <- 0
projects$remix.derivativeness[is.na(projects$remix.derivativeness)] <- 0
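The TRUE/FALSE-to-1/0 recoding mentioned above is not shown in this chunk; a minimal sketch could look like the following, where the choice of logical columns is illustrative (taken from the str(projects) output) rather than the exact set used in the analysis.
#Recode logical columns to 1/0 (illustrative column list)
logical.cols <- c("locked", "is.remix", "is.self.remix", "is.remixed")
for (col in logical.cols) {
  projects[[col]] <- as.integer(projects[[col]])  #TRUE -> 1, FALSE -> 0, NA unchanged
}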
users<-get(load("users.rdata"))
users$user.id<-as.character(users$user.id)
#Merge with User table
uProjects<-merge(projects, users, by="user.id", all.x=TRUE)
#Exclude all projects by users with admin status (the admin column itself is dropped later)
usProjects<-uProjects[ which(uProjects$admin=='FALSE'), ]
dim(usProjects)
## [1] 1928260 46
We assume that users with no projects or only one project are not actively engaged with the website.
library(data.table)
library(dtplyr)
#Create a new variable, count, holding the number of unique projects per user
countd<-setDT(usProjects)[, .(count = uniqueN(project.id)), by = user.id]
#Filtered to only users who created more than 1 project
new.id<-subset(countd, count > 1)
#Merge to the actual dataset
newProjects<-merge(new.id, usProjects, by="user.id", all.x=TRUE)
#dataset reduced to 1781019 obs. of 47 variables
dim(newProjects)
## [1] 1781019 47
We are not interested in projects copied (remixed) from parent projects, so we keep only parent projects.
#Keep only parent (non-remix) projects
newProjects<-subset(newProjects, remix.derivativeness<=0)
#Drop the variables related to remixed projects
newProjects$is.remix <- NULL
newProjects$is.self.remix <- NULL
newProjects$is.remixed <- NULL
newProjects$seconds.to.remix <- NULL
newProjects$is.remix.identical <- NULL
newProjects$is.remix.identical.code <- NULL
newProjects$is.remix.identical.media <- NULL
newProjects$is.remix.identical.code.and.strings <- NULL
newProjects$remix.added.images<- NULL
newProjects$remix.deleted.images <- NULL
newProjects$remix.changed.images <- NULL
newProjects$remix.added.sounds <- NULL
newProjects$remix.deleted.sounds <- NULL
newProjects$remix.changed.sounds <- NULL
newProjects$remix.editdistance.code.and.strings <- NULL
newProjects$remix.editdistance.code <- NULL
newProjects$remix.editdistance.media <- NULL
newProjects$remix.editdistance.images <- NULL
newProjects$remix.editdistance.sounds <- NULL
newProjects$remix.derivativeness <- NULL
newProjects$remix.derivativeness.images <- NULL
newProjects$remix.derivativeness.sounds <- NULL
newProjects$remix.derivativeness.blocks <- NULL
newProjects$remix.derivativeness.text <- NULL
newProjects$remix.derivativeness.parameters <- NULL
#dataset reduced to 1344711 obs. of 22 variables
dim(newProjects)
## [1] 1344711 22
The duration variable shows how long a user stayed on Scratch: the number of days between the dates of their first and last projects.
#Compute each user's duration (days between first and last project)
newProjects$date.created.x2<- as.Date(newProjects$date.created.x)
library(plyr)
durProject<-ddply(newProjects, .(user.id), summarize,
                  duration=(max(date.created.x2)-min(date.created.x2)))
durationProject<-merge(durProject,newProjects, by="user.id",all.y=TRUE)
head(durationProject$duration, n=20)
## Time differences in days
## [1] 1173 1173 1173 1173 1173 1173 1173 1173 1173 1173 1173 1173 1173 1173
## [15] 1173 1173 1173 1173 56 56
#Rank each user's projects by creation date
library(dplyr)
ranked<-durationProject %>%
group_by(user.id) %>%
mutate(rank = rank(date.created.x)) %>%
arrange(user.id, date.created.x, rank)
summary(ranked$rank)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 3.00 10.00 48.56 36.00 2779.00
The usercategory variable categorizes users by the year they started creating projects on Scratch. For example, users who started doing projects in 2007 are categorized as "users2007".
ranked$date<-as.numeric(format(ranked$date.created.x, "%Y"))
usersProject<-ranked
usersProject$ranked<-usersProject$rank
usersProject$usercategory <- ifelse(usersProject$ranked==1 & usersProject$date==2007, "users2007",
                             ifelse(usersProject$ranked==1 & usersProject$date==2008, "users2008",
                             ifelse(usersProject$ranked==1 & usersProject$date==2009, "users2009",
                             ifelse(usersProject$ranked==1 & usersProject$date==2010, "users2010",
                             ifelse(usersProject$ranked==1 & usersProject$date==2011, "users2011",
                                    "new users")))))
dim(usersProject)
## [1] 1344711 28
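As written, the nested ifelse() only labels each user's rank-1 project. A hedged alternative sketch that assigns every project the category of its user's first-project year (matching the description above) could look like this, using dplyr, which is already loaded; usersProject.alt and first.year are names introduced here purely for illustration.
#Alternative sketch: propagate each user's first-project year to all of their projects
usersProject.alt <- ranked %>%
  group_by(user.id) %>%
  mutate(first.year = min(date)) %>%   #date (project year) was computed above
  ungroup() %>%
  mutate(usercategory = ifelse(first.year %in% 2007:2011,
                               paste0("users", first.year),
                               "new users"))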
The featured variable indicates whether or not a project was featured.
featured<-get(load("featured_projects.rdata"))
dim(featured)
## [1] 1227 2
usersProject$project.id<-as.numeric(usersProject$project.id)
joined_data<-merge(usersProject, featured, by="project.id", all.x=TRUE)
joined_data$featured <- ifelse(is.na(joined_data$date.featured),0,1)
#There are only 948 projects featured
table(joined_data$featured)
##
## 0 1
## 1343763 948
dim(joined_data)
## [1] 1344711 30
#convert to numeric
joined_data$viewers.website<-as.numeric(joined_data$viewers.website)
joined_data$lovers.website<-as.numeric(joined_data$lovers.website)
joined_data$downloaders.website<-as.numeric(joined_data$downloaders.website)
joined_data$sprites.website<-as.numeric(joined_data$sprites.website)
joined_data$scripts.website<-as.numeric(joined_data$scripts.website)
joined_data$seconds.to.share<-as.numeric(joined_data$seconds.to.share)
#Drop more variables that are not needed
joined_data$images<-NULL
joined_data$sounds<-NULL
joined_data$ugstrings<-NULL
joined_data$saves<-NULL
joined_data$admin<-NULL
joined_data$date.featured<-NULL
joined_data$parent.project.id<-NULL
joined_data$parent.user.id<-NULL
joined_data$locked<-NULL
joined_data$date.created.y<-NULL
joined_data$date.created.x2<-NULL
joined_data$ranked<-NULL
#Keep users whose duration is greater than 1 day
last_data<-subset(joined_data, as.numeric(duration)>1)
dim(last_data)
## [1] 1221582 18
#Keep only users categorized 2007-2011 (drop "new users")
last_data<-subset(last_data, usercategory!="new users")
table(last_data$usercategory)
##
## users2007 users2008 users2009 users2010 users2011
## 37561 138115 221786 328467 396393
#Keep projects with at least one script
last_data<-subset(last_data, scripts.website>0)
dim(last_data)
## [1] 1032495 18
#Keep projects with at least one block
last_data<-subset(last_data, blocks>0)
dim(last_data)
## [1] 1028449 18
#Keep projects with at least one sprite
last_data<-subset(last_data, sprites.website>0)
dim(last_data)
## [1] 960774 18
#Keep projects with at least one block type
last_data<-subset(last_data, block.types>0)
dim(last_data)
## [1] 960750 18
#Variables we have now:
summary(last_data)
## project.id user.id duration count
## Min. : 107 Min. : 141 Length:960750 Min. : 2.0
## 1st Qu.: 568020 1st Qu.: 208896 Class :difftime 1st Qu.: 11.0
## Median :1170304 Median : 430934 Mode :numeric Median : 32.0
## Mean :1146670 Mean : 455481 Mean : 125.3
## 3rd Qu.:1724604 3rd Qu.: 679323 3rd Qu.: 112.0
## Max. :2250216 Max. :1124497 Max. :3047.0
##
## date.created.x viewers.website lovers.website
## Min. :2007-03-05 07:50:31 Min. : 0.00 Min. : 0.00
## 1st Qu.:2009-06-12 15:24:40 1st Qu.: 4.00 1st Qu.: 0.00
## Median :2010-07-03 19:59:05 Median : 12.00 Median : 0.00
## Mean :2010-04-25 03:11:34 Mean : 36.67 Mean : 0.91
## 3rd Qu.:2011-04-14 15:01:32 3rd Qu.: 27.00 3rd Qu.: 0.00
## Max. :2011-12-31 23:54:32 Max. :111682.00 Max. :3787.00
##
## downloaders.website sprites.website scripts.website
## Min. : 0.000 Min. : 1.00 Min. : 1.00
## 1st Qu.: 0.000 1st Qu.: 1.00 1st Qu.: 2.00
## Median : 0.000 Median : 3.00 Median : 5.00
## Mean : 2.401 Mean : 5.62 Mean : 16.59
## 3rd Qu.: 1.000 3rd Qu.: 6.00 3rd Qu.: 13.00
## Max. :8770.000 Max. :1905.00 Max. :200000.00
##
## blocks block.types seconds.to.share
## Min. : 1.0 Min. : 1.00 Min. :-1.317e+09
## 1st Qu.: 13.0 1st Qu.: 6.00 1st Qu.: 4.340e+02
## Median : 32.0 Median : 10.00 Median : 7.241e+04
## Mean : 102.2 Mean : 13.13 Mean : 3.341e+06
## 3rd Qu.: 85.0 3rd Qu.: 18.00 3rd Qu.: 7.855e+05
## Max. :252237.0 Max. :118.00 Max. : 1.325e+09
## NA's :393304
## country rank date
## United States :523032 Min. : 1.00 Min. :2007
## United Kingdom : 79088 1st Qu.: 4.00 1st Qu.:2009
## Korea, Republic of: 65111 Median : 10.00 Median :2010
## Canada : 33464 Mean : 46.26 Mean :2010
## Taiwan : 24598 3rd Qu.: 35.00 3rd Qu.:2011
## Australia : 18385 Max. :2592.00 Max. :2011
## (Other) :217072
## usercategory featured
## Length:960750 Min. :0.0000000
## Class :character 1st Qu.:0.0000000
## Mode :character Median :0.0000000
## Mean :0.0009191
## 3rd Qu.:0.0000000
## Max. :1.0000000
##
Most users are from the USA, with the UK and South Korea second and third, respectively.
On average, users stayed 379 days; the maximum is 1829 days.
# average duration
mean(last_data$duration)
## Time difference of 379.7643 days
#max and min number of days
last_data$duration<-as.numeric(last_data$duration)
summary(last_data$duration)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 63.0 248.0 379.8 604.0 1829.0
#number of days for each user category
library(ggplot2)
library(ggthemes)
hist_project<-subset(last_data, duration>7)
ggplot(hist_project, aes(x=duration)) +
geom_bar() +
facet_grid(. ~ usercategory) +
ggtitle("Figure 2. Number of Days Stayed") +
theme_classic() +
theme(legend.position="bottom") +
scale_y_continuous("Frequency")
#Let's look at the same summaries in a table
tapply(last_data$duration, last_data$usercategory, summary)
## $users2007
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 43.0 255.0 437.1 718.0 1829.0
##
## $users2008
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 71.0 348.0 495.9 840.0 1829.0
##
## $users2009
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 80.0 364.0 477.4 830.0 1829.0
##
## $users2010
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 74.0 306.0 388.4 621.0 1829.0
##
## $users2011
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 49.0 176.0 269.5 380.0 1829.0
tapply(last_data$count, last_data$usercategory, summary)
## $users2007
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 9.00 23.00 54.23 62.00 3047.00
##
## $users2008
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 11.0 33.0 109.5 100.0 3047.0
##
## $users2009
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 11.0 39.0 138.9 131.0 3047.0
##
## $users2010
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 11.0 34.0 142.8 133.0 3047.0
##
## $users2011
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2 11 29 116 101 3047
In Scratch, scripts are stacks of code blocks that make sprites (characters) move or perform actions.
summary(last_data$scripts.website)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 5.00 16.59 13.00 200000.00
Let's find which features are good predictors of a project being featured on Scratch.
# Make sure featured is a factor, not a number
last_data$featured <- factor(last_data$featured)
levels(last_data$featured)
## [1] "0" "1"
# Fit a logistic regression model using all the variables first
project.log.all <- glm(featured~duration+count+scripts.website+sprites.website
+downloaders.website+blocks+block.types+lovers.website+viewers.website+rank
+usercategory,
data=last_data, family="binomial")
summary(project.log.all)
##
## Call:
## glm(formula = featured ~ duration + count + scripts.website +
## sprites.website + downloaders.website + blocks + block.types +
## lovers.website + viewers.website + rank + usercategory, family = "binomial",
## data = last_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.4904 -0.0331 -0.0225 -0.0170 4.2882
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.062e+00 1.041e-01 -67.842 < 2e-16 ***
## duration 1.428e-03 8.032e-05 17.775 < 2e-16 ***
## count -1.145e-03 3.558e-04 -3.219 0.00129 **
## scripts.website -4.064e-05 1.210e-04 -0.336 0.73697
## sprites.website 3.566e-03 8.547e-04 4.172 3.02e-05 ***
## downloaders.website -2.047e-03 4.392e-04 -4.662 3.13e-06 ***
## blocks 1.623e-05 1.144e-05 1.418 0.15615
## block.types 5.596e-02 2.277e-03 24.574 < 2e-16 ***
## lovers.website 1.136e-04 1.020e-03 0.111 0.91130
## viewers.website 9.310e-04 7.043e-05 13.219 < 2e-16 ***
## rank -1.417e-03 1.001e-03 -1.415 0.15713
## usercategoryusers2008 -1.248e+00 9.889e-02 -12.618 < 2e-16 ***
## usercategoryusers2009 -2.069e+00 1.150e-01 -17.995 < 2e-16 ***
## usercategoryusers2010 -2.218e+00 1.158e-01 -19.160 < 2e-16 ***
## usercategoryusers2011 -2.637e+00 1.325e-01 -19.903 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14113 on 960749 degrees of freedom
## Residual deviance: 10395 on 960735 degrees of freedom
## AIC: 10425
##
## Number of Fisher Scoring iterations: 12
## One solution is to exponentiate the coefficients into odds ratios, which are easier to interpret
project.tab.all <- coef(summary(project.log.all))
project.tab.all[, "Estimate"] <- exp(coef(project.log.all))
project.tab.all
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.0008568915 1.040980e-01 -67.8418356 0.000000e+00
## duration 1.0014286769 8.031630e-05 17.7754367 1.095415e-70
## count 0.9988553566 3.557960e-04 -3.2189767 1.286489e-03
## scripts.website 0.9999593653 1.209880e-04 -0.3358640 7.369734e-01
## sprites.website 1.0035721933 8.546907e-04 4.1720684 3.018470e-05
## downloaders.website 0.9979546146 4.391712e-04 -4.6621456 3.129297e-06
## blocks 1.0000162277 1.144293e-05 1.4181297 1.561529e-01
## block.types 1.0575536662 2.277132e-03 24.5740613 2.392746e-133
## lovers.website 1.0001136428 1.020142e-03 0.1113927 9.113049e-01
## viewers.website 1.0009313881 7.042521e-05 13.2190526 6.811654e-40
## rank 0.9985843958 1.001277e-03 -1.4148010 1.571268e-01
## usercategoryusers2008 0.2871452608 9.888545e-02 -12.6183085 1.673777e-36
## usercategoryusers2009 0.1262850291 1.149909e-01 -17.9945875 2.148157e-72
## usercategoryusers2010 0.1087955590 1.157777e-01 -19.1598554 8.009538e-82
## usercategoryusers2011 0.0715448771 1.325148e-01 -19.9029045 3.840181e-88
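For example, the exponentiated estimate for block.types is about 1.058, so each additional block type multiplies the odds of a project being featured by roughly 1.06 (about a 5.8% increase in the odds), holding the other predictors constant. The usercategory odds ratios are relative to the users2007 reference level.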
#Define usercategory as a factor
last_data$usercategory<-factor(last_data$usercategory)
# Create a dataset with predictors set at desired levels
set.seed(200)
project.log.pred.all <- with(last_data,
expand.grid(duration = mean(last_data$duration),
count = mean(last_data$count),
sprites.website = mean(last_data$sprites.website),
scripts.website = mean(last_data$scripts.website),
downloaders.website = mean(last_data$downloaders.website),
blocks = mean(last_data$blocks),
block.types = mean(last_data$block.types),
lovers.website = mean(last_data$lovers.website),
viewers.website = mean(last_data$viewers.website),
rank = mean(last_data$rank),
usercategory=c("users2008", "users2009", "users2010","users2011")))
# predict featured projects at those levels
cbind(project.log.pred.all, predict(project.log.all, type = "response",
                                    se.fit = TRUE,
                                    newdata = project.log.pred.all))
## duration count sprites.website scripts.website downloaders.website
## 1 379.7643 379.7643 5.61988 16.59072 2.401377
## 2 379.7643 379.7643 5.61988 16.59072 2.401377
## 3 379.7643 379.7643 5.61988 16.59072 2.401377
## 4 379.7643 379.7643 5.61988 16.59072 2.401377
## blocks block.types lovers.website viewers.website rank
## 1 102.2194 13.12792 0.9104169 36.67256 46.26367
## 2 102.2194 13.12792 0.9104169 36.67256 46.26367
## 3 102.2194 13.12792 0.9104169 36.67256 46.26367
## 4 102.2194 13.12792 0.9104169 36.67256 46.26367
## usercategory fit se.fit residual.scale
## 1 users2008 0.0005620791 6.946056e-05 1
## 2 users2009 0.0002472774 3.362105e-05 1
## 3 users2010 0.0002130388 2.826645e-05 1
## 4 users2011 0.0001401063 2.067731e-05 1
# Among the categories shown, users2008 have the highest predicted probability of being featured
# (users2007 is the reference level and is not included in the prediction grid)
Let's fit another model using only the significant predictors.
#Keep only the significant variables in the model
project.log.six <- glm(featured~duration+sprites.website
+downloaders.website+block.types+viewers.website
+usercategory,
data=last_data, family="binomial")
summary(project.log.six)
##
## Call:
## glm(formula = featured ~ duration + sprites.website + downloaders.website +
## block.types + viewers.website + usercategory, family = "binomial",
## data = last_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.4904 -0.0334 -0.0227 -0.0169 4.0474
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.019e+00 1.022e-01 -68.712 < 2e-16 ***
## duration 1.177e-03 7.330e-05 16.061 < 2e-16 ***
## sprites.website 3.680e-03 7.485e-04 4.917 8.79e-07 ***
## downloaders.website -1.894e-03 4.267e-04 -4.438 9.07e-06 ***
## block.types 5.868e-02 2.227e-03 26.345 < 2e-16 ***
## viewers.website 9.199e-04 5.002e-05 18.392 < 2e-16 ***
## usercategoryusers2008 -1.347e+00 9.791e-02 -13.763 < 2e-16 ***
## usercategoryusers2009 -2.212e+00 1.125e-01 -19.658 < 2e-16 ***
## usercategoryusers2010 -2.400e+00 1.111e-01 -21.596 < 2e-16 ***
## usercategoryusers2011 -2.826e+00 1.276e-01 -22.142 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 14113 on 960749 degrees of freedom
## Residual deviance: 10456 on 960740 degrees of freedom
## AIC: 10476
##
## Number of Fisher Scoring iterations: 11
## One solution is to exponentiate the coefficients into odds ratios, which are easier to interpret
project.tab.six <- coef(summary(project.log.six))
project.tab.six[, "Estimate"] <- exp(coef(project.log.six))
project.tab.six
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.0008947138 1.021508e-01 -68.712179 0.000000e+00
## duration 1.0011778835 7.329632e-05 16.060701 4.810811e-58
## sprites.website 1.0036870876 7.484790e-04 4.917048 8.785908e-07
## downloaders.website 0.9981078626 4.267230e-04 -4.438312 9.066707e-06
## block.types 1.0604377105 2.227409e-03 26.345295 5.810806e-153
## viewers.website 1.0009203096 5.001667e-05 18.391595 1.534013e-75
## usercategoryusers2008 0.2598915876 9.790668e-02 -13.763012 4.254288e-43
## usercategoryusers2009 0.1094612629 1.125333e-01 -19.658035 4.934716e-86
## usercategoryusers2010 0.0907295141 1.111252e-01 -21.596111 1.953883e-103
## usercategoryusers2011 0.0592694701 1.276127e-01 -22.142482 1.232588e-108
# Create a dataset with predictors set at desired levels
set.seed(201)
project.log.pred.six <- with(last_data,
expand.grid(duration = mean(last_data$duration),
sprites.website = mean(last_data$sprites.website),
downloaders.website = mean(last_data$downloaders.website),
block.types = mean(last_data$block.types),
viewers.website = mean(last_data$viewers.website),
usercategory=c("users2008", "users2009", "users2010","users2011")))
# predict featured projects at those levels
cbind(project.log.pred.six, predict(project.log.six, type = "response",
                                    se.fit = TRUE,
                                    newdata = project.log.pred.six))
## duration sprites.website downloaders.website block.types viewers.website
## 1 379.7643 5.61988 2.401377 13.12792 36.67256
## 2 379.7643 5.61988 2.401377 13.12792 36.67256
## 3 379.7643 5.61988 2.401377 13.12792 36.67256
## 4 379.7643 5.61988 2.401377 13.12792 36.67256
## usercategory fit se.fit residual.scale
## 1 users2008 0.0008250810 6.646288e-05 1
## 2 users2009 0.0003476740 3.359117e-05 1
## 3 users2010 0.0002881949 2.646845e-05 1
## 4 users2011 0.0001882834 2.048066e-05 1
Let's also try a random forest model, since random forests provide measures of variable importance.
library(randomForest)
#randomForest handles the factor outcome directly; just make sure count is stored as numeric
last_data$count<-as.numeric(last_data$count)
set.seed(300)
project.rf.all <- randomForest(featured ~ duration+count+scripts.website+sprites.website
                               +downloaders.website+blocks+block.types+lovers.website
                               +viewers.website+rank,
                               data=last_data, importance=TRUE, proximity=FALSE,
                               ntree=100, keepForest=FALSE)
print(project.rf.all)
##
## Call:
## randomForest(formula = featured ~ duration + count + scripts.website + sprites.website + downloaders.website + blocks + block.types + lovers.website + viewers.website + rank, data = last_data, importance = TRUE, proximity = FALSE, ntree = 100, keepForest = FALSE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 0.09%
## Confusion matrix:
## 0 1 class.error
## 0 959805 62 6.459228e-05
## 1 818 65 9.263873e-01
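Note the strong class imbalance: only 883 of the 960,750 projects are featured, so the forest labels almost everything as not featured. The OOB error rate looks tiny (0.09%) largely because of this imbalance; the class error for featured projects is about 93%.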
#Find the important variables
imp<-importance(project.rf.all)
imp
## 0 1 MeanDecreaseAccuracy
## duration 17.412215 5.692023 18.371301
## count 22.884475 -10.742340 22.109018
## scripts.website 15.353968 -5.370541 15.288251
## sprites.website 15.310424 -6.095096 15.028929
## downloaders.website 1.719113 8.411416 3.990379
## blocks 15.513761 -7.970001 15.161342
## block.types 18.305481 -4.720572 18.525042
## lovers.website -10.504469 19.092379 -6.851627
## viewers.website -1.718105 20.869036 1.478447
## rank 36.364056 -7.895320 34.272650
## MeanDecreaseGini
## duration 193.2656
## count 164.5450
## scripts.website 129.4380
## sprites.website 103.4511
## downloaders.website 220.7476
## blocks 163.1836
## block.types 135.6413
## lovers.website 208.1411
## viewers.website 302.2849
## rank 143.7522
varImpPlot(project.rf.all, cex=.8, pch=19, col="navyblue", main="Variable Importance")
VarName<-c("duration", "count", "scripts.website", "sprites.website",
"downloaders.website", "blocks", "block.types",
"lovers.website", "viewers.website", "rank")
#Correlation between variables
checkCor <- round( cor(last_data[,VarName],
method="spearman"),2)
checkCor
## duration count scripts.website sprites.website
## duration 1.00 0.67 0.08 0.06
## count 0.67 1.00 -0.03 -0.02
## scripts.website 0.08 -0.03 1.00 0.76
## sprites.website 0.06 -0.02 0.76 1.00
## downloaders.website 0.23 0.15 0.18 0.15
## blocks 0.07 -0.07 0.85 0.67
## block.types 0.06 -0.08 0.70 0.48
## lovers.website 0.20 0.15 0.13 0.11
## viewers.website 0.38 0.28 0.18 0.14
## rank 0.56 0.83 -0.02 0.00
## downloaders.website blocks block.types lovers.website
## duration 0.23 0.07 0.06 0.20
## count 0.15 -0.07 -0.08 0.15
## scripts.website 0.18 0.85 0.70 0.13
## sprites.website 0.15 0.67 0.48 0.11
## downloaders.website 1.00 0.20 0.19 0.43
## blocks 0.20 1.00 0.83 0.14
## block.types 0.19 0.83 1.00 0.12
## lovers.website 0.43 0.14 0.12 1.00
## viewers.website 0.62 0.16 0.12 0.55
## rank 0.13 -0.04 -0.03 0.11
## viewers.website rank
## duration 0.38 0.56
## count 0.28 0.83
## scripts.website 0.18 -0.02
## sprites.website 0.14 0.00
## downloaders.website 0.62 0.13
## blocks 0.16 -0.04
## block.types 0.12 -0.03
## lovers.website 0.55 0.11
## viewers.website 1.00 0.18
## rank 0.18 1.00
#Plot correlation
library(corrplot)
layout(matrix(1:1, ncol = 3))
corrplot(checkCor, method="number", tl.cex = 1)
layout(1)
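#Refit the random forest with six predictors, dropping variables that are highly correlated
#with ones that are kept (from the matrix above: blocks with scripts.website, r = 0.85;
#count with rank, r = 0.83; sprites.website with scripts.website, r = 0.76)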
project.rf.six <- randomForest(featured ~ duration+rank+scripts.website
                               +downloaders.website+block.types+viewers.website,
                               data=last_data, importance=TRUE, proximity=FALSE,
                               ntree=100, keepForest=FALSE)
print(project.rf.six)
##
## Call:
## randomForest(formula = featured ~ duration + rank + scripts.website + downloaders.website + block.types + viewers.website, data = last_data, importance = TRUE, proximity = FALSE, ntree = 100, keepForest = FALSE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0.09%
## Confusion matrix:
## 0 1 class.error
## 0 959809 58 6.042504e-05
## 1 836 47 9.467724e-01
VarNam6<-c("duration", "rank", "scripts.website",
"downloaders.website", "block.types", "viewers.website")
varNam6s <- c(VarNam6,"featured")
#Plot Independent Variables
panel.smooth1 <- function(...)panel.smooth(col.smooth="#e41a1c",lwd=3,...)
pairs(last_data[, varNam6s], panel=panel.smooth1,
gap=0, las=1, pch=21, bg="#4daf4a", cex=0.9,
main="Featured Projects and Explanatory Variables")
last_project<-last_data
last_project$country<-NULL
last_project$date.created.x<-NULL
last_project$project.id<-NULL
last_project$usercategory<-NULL
last_project$seconds.to.share<-NULL
str(last_project)
## 'data.frame': 960750 obs. of 13 variables:
## $ user.id : int 145 143 144 146 149 153 152 150 152 150 ...
## $ duration : num 35 56 56 81 49 63 49 56 49 56 ...
## $ count : num 4 3 6 7 4 4 7 7 7 7 ...
## $ viewers.website : num 1121 524 293 2055 384 ...
## $ lovers.website : num 7 6 1 29 4 2 0 2 0 4 ...
## $ downloaders.website: num 69 31 12 90 36 8 9 8 5 3 ...
## $ sprites.website : num 5 1 15 3 1 5 2 3 1 3 ...
## $ scripts.website : num 5 2 11 4 2 2 2 1 1 1 ...
## $ blocks : num 14 3 27 8 6 11 6 3 3 3 ...
## $ block.types : num 4 2 7 3 6 7 5 3 3 3 ...
## $ rank : num 1 1 1 1 1 1 1 1 2 2 ...
## $ date : num 2007 2007 2007 2007 2007 ...
## $ featured : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
df<-last_project[,-1]   #drop user.id before clustering
#Elbow method: compute the within-groups sum of squares for k = 1 to 15 clusters
#(numeric columns only; the featured factor is excluded)
num_vars <- df[, names(df) != "featured"]
wss <- (nrow(num_vars)-1)*sum(apply(num_vars,2,var))
for (i in 2:15) wss[i] <- sum(kmeans(num_vars, centers=i)$withinss)
plot(1:15, wss, type="b", xlab="Number of Clusters",
     ylab="Within groups sum of squares",
     main="Assessing the Optimal Number of Clusters with the Elbow Method",
     pch=20, cex=2)
Before clustering, the numeric variables (user.id excluded) are standardized; the summary of the scaled data is shown below.
## duration count viewers.website
## Min. :-0.9835 Min. :-0.40886 Min. : -0.1494
## 1st Qu.:-0.8247 1st Qu.:-0.37900 1st Qu.: -0.1331
## Median :-0.3431 Median :-0.30935 Median : -0.1005
## Mean : 0.0000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.5838 3rd Qu.:-0.04401 3rd Qu.: -0.0394
## Max. : 3.7732 Max. : 9.69081 Max. :454.8754
## lovers.website downloaders.website sprites.website
## Min. : -0.0976 Min. : -0.0929 Min. : -0.39296
## 1st Qu.: -0.0976 1st Qu.: -0.0929 1st Qu.: -0.39296
## Median : -0.0976 Median : -0.0929 Median : -0.22284
## Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: -0.0976 3rd Qu.: -0.0542 3rd Qu.: 0.03233
## Max. :405.9589 Max. :339.2207 Max. :161.55696
## scripts.website blocks block.types
## Min. : -0.0375 Min. : -0.1531 Min. :-1.2160
## 1st Qu.: -0.0351 1st Qu.: -0.1350 1st Qu.:-0.7147
## Median : -0.0279 Median : -0.1062 Median :-0.3136
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: -0.0086 3rd Qu.: -0.0261 3rd Qu.: 0.4885
## Max. :480.8525 Max. :381.4510 Max. :10.5146
## rank date featured
## Min. :-0.32849 Min. :-2.4176 0:959867
## 1st Qu.:-0.30672 1st Qu.:-0.6854 1: 883
## Median :-0.26318 Median : 0.1807
## Mean : 0.00000 Mean : 0.0000
## 3rd Qu.:-0.08174 3rd Qu.: 1.0468
## Max. :18.47513 Max. : 1.0468
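As a next step, a minimal k-means sketch on the scaled data might look like the following. The object scaled_df, the seed, nstart = 25, and the choice of 4 clusters are illustrative assumptions, not values taken from the analysis above.
#Sketch: k-means on the standardized numeric variables (illustrative)
scaled_df <- as.data.frame(scale(df[, names(df) != "featured"]))
set.seed(400)                                      #hypothetical seed
km <- kmeans(scaled_df, centers = 4, nstart = 25)  #k = 4 is an assumed choice from the elbow plot
table(km$cluster, last_project$featured)           #how featured projects spread across clusters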