library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the interaction log from disk and print a per-column summary of it.
interactions.v01 <- read.csv(file = "~/Desktop/interactions-v01.csv")
summary(object = interactions.v01)
## user_id item_id interaction_type week
## Min. : 7 Min. : 5 Min. :1.000 Min. :33.00
## 1st Qu.: 750673 1st Qu.: 728099 1st Qu.:1.000 1st Qu.:36.00
## Median :1492171 Median :1422495 Median :1.000 Median :39.00
## Mean :1491789 Mean :1426966 Mean :1.464 Mean :38.85
## 3rd Qu.:2232266 3rd Qu.:2135364 3rd Qu.:1.000 3rd Qu.:42.00
## Max. :2984888 Max. :2846369 Max. :4.000 Max. :45.00
# Number of distinct users, items and weeks present in the interaction log.
n_distinct(interactions.v01$user_id)
## [1] 784687
n_distinct(interactions.v01$item_id)
## [1] 1029480
n_distinct(interactions.v01$week)
## [1] 13
O número de interações por semana é bastante regular, com exceção da primeira e da última semana.
# Distribution plot for each column of the interaction log, drawn on a
# 1 x 2 grid (so the four plots fill two pages).
par(mfrow = c(1, 2))
plot(density(interactions.v01$user_id), main = "USER ID")
plot(density(interactions.v01$item_id), main = "ITEM ID")
hist(interactions.v01$interaction_type, main = "INTERACTION")
plot(density(interactions.v01$week), main = "WEEK")
# Count the interactions of each type, add every type's share of the total,
# and replace the numeric type codes with readable names.
interactions <- interactions.v01 %>%
  group_by(interaction_type) %>%
  summarise(count = n())
interactions <- transform(interactions, ratio = count / sum(count))
# Position i holds the name of interaction type code i. The vector is no
# longer stored in a variable called `c`, which masked base::c().
interaction_names <- c("click", "bookmarked", "apply", "delete")
type_code <- interactions$interaction_type
name_idx <- match(type_code, seq_along(interaction_names))
# One vectorised lookup replaces the per-code loop; a code without a name is
# kept as text, which is what the loop did as well.
interactions$interaction_type <- ifelse(
  is.na(name_idx),
  as.character(type_code),
  interaction_names[name_idx]
)
interactions
## interaction_type count ratio
## 1 click 7183038 0.81378725
## 2 bookmarked 206191 0.02335998
## 3 apply 422026 0.04781255
## 4 delete 1015423 0.11504022
Click representa mais de 81% das interações, enquanto o número de aplicações (apply) é inferior a 5%.
Como podemos perceber no boxplot, o número de outliers é tão expressivo que não conseguimos observar a localização da mediana.
# Number of interactions recorded for each item, least-interacted items first.
items_ <- interactions.v01 %>%
  group_by(item_id) %>%
  summarise(count = n())
items_ <- items_[order(items_$count), ]
head(items_)
## Source: local data frame [6 x 2]
##
## item_id count
## (int) (int)
## 1 5 1
## 2 19 1
## 3 22 1
## 4 39 1
## 5 40 1
## 6 59 1
# Box plot of the interactions-per-item counts, followed by the share of items
# with more than 50 interactions, exactly 1 interaction, and at most 3.
boxplot(items_$count)
sum(items_$count > 50) / nrow(items_)
## [1] 0.0236906
sum(items_$count == 1) / nrow(items_)
## [1] 0.3168804
sum(items_$count <= 3) / nrow(items_)
## [1] 0.5823804
Concluímos que, em cerca de 32% dos itens, apenas um usuário interagiu com o item, um valor bastante baixo. Além disso, considerando o gráfico e as informações acima, quase 60% dos itens tiveram no máximo 3 interações.
# Share of all items that received at least one interaction of type 4, then
# the share that received at least one interaction of type 2 or type 3.
item_ids <- interactions.v01$item_id
type_ids <- interactions.v01$interaction_type
n_items_total <- length(unique(item_ids))
length(unique(item_ids[type_ids == 4])) / n_items_total
## [1] 0.2096631
length(unique(item_ids[type_ids == 2 | type_ids == 3])) / n_items_total
## [1] 0.2280734
Cerca de 22,8% dos itens tiveram usuários que favoritaram a oferta ou se candidataram a ela, número superior ao de itens deletados, que é de cerca de 21,0%.
# Number of interactions for every (item, week) pair, busiest pairs first.
item_week <- interactions.v01 %>%
  group_by(item_id, week) %>%
  summarise(count = n())
item_week <- item_week[order(-item_week$count), ]
head(item_week)
## Source: local data frame [6 x 3]
## Groups: item_id [4]
##
## item_id week count
## (int) (int) (int)
## 1 1053452 38 5057
## 2 1053452 39 4936
## 3 2778525 44 3949
## 4 1007923 38 3373
## 5 1244196 44 2788
## 6 1007923 37 2633
Número de interações por semana
# Total number of interactions in each week, busiest weeks first.
it <- item_week %>%
  group_by(week) %>%
  summarise(count = sum(count))
# `it` already has exactly one row per week, so it is ranked directly; the
# former second group-by-week aggregation only recomputed the same table.
top_weeks <- it[order(-it$count), ]
head(top_weeks)
## Source: local data frame [6 x 2]
##
## week count
## (int) (int)
## 1 38 822074
## 2 42 807675
## 3 37 797578
## 4 41 785063
## 5 39 781468
## 6 43 775954
# Weekly totals again, this time counting only the (item, week) pairs that
# have more than 500 interactions; busiest weeks first.
iw <- item_week[item_week$count > 500, ]
top_weeks <- iw %>%
  group_by(week) %>%
  summarise(count = sum(count))
top_weeks <- top_weeks[order(-top_weeks$count), ]
head(top_weeks)
## Source: local data frame [6 x 2]
##
## week count
## (int) (int)
## 1 38 28913
## 2 39 27473
## 3 40 25759
## 4 43 24182
## 5 42 21831
## 6 41 20694
# Total interactions per item over all weeks, most-interacted items first.
item_count <- item_week %>%
  group_by(item_id) %>%
  summarise(count = sum(count))
item_count <- item_count[order(-item_count$count), ]
head(item_count)
## Source: local data frame [6 x 2]
##
## item_id count
## (int) (int)
## 1 1053452 13679
## 2 2268722 12627
## 3 1007923 10534
## 4 2778525 8496
## 5 1729618 7144
## 6 1244196 6809
# Load the user profiles. stringsAsFactors is set explicitly because later
# steps in this file rely on text columns being factors (for example,
# summary(users$country) prints per-level counts); since R 4.0.0 read.csv()
# no longer converts strings to factors by default.
users_v01 <- read.csv("~/Desktop/users_v01.csv", stringsAsFactors = TRUE)
# Copy of the profiles taken before any code is replaced by a label; the
# correlation step further down reads the raw codes from it.
users_b <- users_v01
# Names for career_level codes 0..6, in code order. The vector is no longer
# stored in a variable called `c`, which masked base::c().
# NOTE(review): 'Sudent/Intern' looks like a typo for 'Student/Intern'; it is
# left unchanged because the label is runtime data that other code or saved
# output may match on.
career_names <- c(
  'desconhecido', 'Sudent/Intern', 'Entry Level', 'Professional',
  'Manager', 'Executive', 'Senior Executive'
)
career_text <- as.character(users_v01$career_level)
career_pos <- match(career_text, as.character(seq_along(career_names) - 1))
# Only values that are one of the codes 0..6 get a name; anything else
# (such as the literal "NULL") stays as it is, as in the original loop.
has_name <- !is.na(career_pos)
career_text[has_name] <- career_names[career_pos[has_name]]
users_v01$career_level <- as.factor(career_text)
summary(users_v01$career_level)
## desconhecido Entry Level Executive Manager
## 912216 86782 158857 472007
## NULL Professional Senior Executive Sudent/Intern
## 320223 757705 210642 16617
# Share of users whose career_level is the literal string "NULL"; `users`
# then keeps only the remaining rows.
career_is_null <- users_v01$career_level == "NULL"
nrow(users_v01[career_is_null, ]) / nrow(users_v01)
## [1] 0.1091031
users <- users_v01[!career_is_null, ]
Existe um número expressivo de valores NULL (cerca de 11%) e desconhecido (cerca de 31%).
# Replace edu_degree codes 0..3 with readable names; every other value
# (including the literal "NULL") is left as it is.
# `c` is kept as the variable name because it stays defined after this block.
c <- c('Desconhecido','Graduado','Mestre','Doutor')
degree_text <- as.character(users_v01$edu_degree)
degree_pos <- match(degree_text, as.character(seq_along(c) - 1))
degree_named <- !is.na(degree_pos)
degree_text[degree_named] <- c[degree_pos[degree_named]]
users_v01$edu_degree <- as.factor(degree_text)
summary(users_v01$edu_degree)
## 4 5 6 7 8
## 23203 70515 13561 78339 196851
## 9 Desconhecido Doutor Graduado Mestre
## 25833 1438518 154015 241323 692547
## NULL
## 344
Como podemos observar, há alguns valores (4 a 9) que não estão presentes na descrição dos dados oferecida pelo XING. Consideraremos apenas os valores que foi possível nomear.
# Correlation between education degree and career level, restricted to rows
# whose codes fall in the ranges selected below.
#
# The previous version called as.numeric() directly on factor columns, which
# returns level positions (1, 2, 3, ...) instead of the stored codes, and
# used thresholds shifted by one to compensate. The columns are now converted
# through as.character(), so the filters are written on the real codes and do
# not depend on whether read.csv() produced factors or character columns.
edu_code <- as.character(users_b$edu_degree)
edu_code[edu_code %in% "NULL"] <- NA
users_b$edu_degree <- as.numeric(edu_code)
# Degrees 1..3 (named Graduado, Mestre, Doutor in this file); these are the
# values that sat at level positions 2..4 of the raw factor.
users_b <- users_b[users_b$edu_degree %in% 1:3, ]
users_b$edu_degree <- as.factor(users_b$edu_degree)
career_code <- as.character(users_b$career_level)
career_code[career_code %in% "NULL"] <- NA
users_b$career_level <- as.numeric(career_code)
# Career levels 2..6, i.e. level positions 3..7 of a factor with levels
# "0".."6","NULL" (the values listed by summary() of career_level).
# NOTE(review): as before, code 1 is excluded together with 0 and "NULL" --
# confirm that leaving out the student/intern level is intended.
users_b <- users_b[users_b$career_level %in% 2:6, ]
cor(as.numeric(as.character(users_b$edu_degree)), users_b$career_level)
## [1] 0.1835106
Podemos concluir que a correlação entre formação e nível de carreira é muito baixa (aproximadamente 0,18).
# Frequency of each discipline_id among `users`; the ids are turned into text
# first, so the resulting levels are ordered as strings.
discipline_label <- as.character(users$discipline_id)
summary(as.factor(discipline_label))
## 0 1 10 11 12 13 14 15 16
## 1953834 8787 7474 10077 10522 21094 81090 8145 5780
## 17 18 19 2 20 21 22 23 3
## 16722 30275 25934 115884 21642 1770 7172 18541 110059
## 4 5 6 7 8 9
## 992 18441 31324 42926 9764 56577
# Histograms of the coded profile columns of `users`, on a 2 x 2 grid.
#
# summary() output elsewhere in this file prints per-level counts for columns
# such as region and experience_years_experience, so they are factors. Calling
# as.numeric() on a factor returns level positions, not the stored codes
# (levels sort as text, e.g. "0", "1", "10", ...), which shifts or scrambles
# the x axis. stored_codes() recovers the real codes instead.

# Return the numeric codes stored in column x. Numeric input is returned
# unchanged; factor or character input is read from its text, with the
# literal "NULL" treated as missing.
stored_codes <- function(x) {
  if (is.numeric(x)) {
    return(x)
  }
  txt <- as.character(x)
  txt[txt %in% "NULL"] <- NA
  as.numeric(txt)
}

par(mfrow = c(2, 2))
hist(stored_codes(users$industry_id), main = "Industria",
     xlab = "industry_id")
hist(stored_codes(users$discipline_id), main = "Area",
     xlab = "discipline_id")
summary(users$country)
## at ch de non_dach NULL
## 148494 174584 2191791 99957 0
hist(stored_codes(users$region), main = "Regiao", xlab = "region")
hist(stored_codes(users$experience_n_entries_class), main = "CV",
     xlab = "experience_n_entries_class")
hist(stored_codes(users$experience_years_experience), main = "Experiencia",
     xlab = "experience_years_experience")
hist(stored_codes(users$experience_years_in_current), main = "Fidelidade",
     xlab = "experience_years_in_current")
hist(stored_codes(users$edu_degree), main = "Formacao", xlab = "edu_degree")
# Frequency of each edu_fieldofstudies value among `users`.
field_of_study <- as.factor(users$edu_fieldofstudies)
summary(field_of_study)
## 1 2 3 4 5 6 7 8 9
## 4377 54607 179055 32048 175511 11110 59249 328980 3563
## NA's
## 1766326
Nos gráficos acima podemos observar vários outliers
Podemos também observar que uma parcela considerável passa menos de 3 anos no mesmo emprego.
# Split users by how long they have been in their current job.
#
# The previous version applied as.numeric() to a factor column, so it
# compared level positions rather than stored codes, and the "NULL" level
# (the last position) fell into dez_anos. The recorded summary of dez_anos in
# this file shows those "NULL" rows. The column is now converted through
# as.character(), "NULL" becomes NA, and rows with NA are left out of both
# groups.
# NOTE(review): the thresholds below assume the raw levels were
# "0".."7","NULL", so that positions 2..3 are codes 1..2 and positions 7..8
# are codes 6..7 -- confirm against the data. Summaries recorded in this file
# were produced by the previous version and will differ.
users_b <- users_v01
years_text <- as.character(users_b$experience_years_in_current)
years_text[years_text %in% "NULL"] <- NA
users_b$experience_years_in_current <- as.numeric(years_text)
years_in_current <- users_b$experience_years_in_current
# which() drops NA comparisons instead of turning them into all-NA rows.
tres_anos <- users_b[which(years_in_current >= 1 & years_in_current <= 2), ]
dez_anos <- users_b[which(years_in_current >= 6), ]
summary(tres_anos)
## id jobroles career_level
## Min. : 9 Min. : 0 desconhecido:366125
## 1st Qu.: 749866 1st Qu.: 973920 Professional:354412
## Median :1493466 Median :2314529 Manager :227272
## Mean :1494326 Mean :2222261 NULL :109546
## 3rd Qu.:2238725 3rd Qu.:3408137 Entry Level : 65191
## Max. :2984888 Max. :4583506 Executive : 60153
## (Other) : 55056
## discipline_id industry_id country region
## Min. : 0.000 Min. : 0.000 at : 62976 0 :583499
## 1st Qu.: 0.000 1st Qu.: 0.000 ch : 82492 2 :133191
## Median : 0.000 Median : 2.000 de :1045091 9 :114034
## Mean : 3.111 Mean : 7.323 non_dach: 47079 1 :113849
## 3rd Qu.: 3.000 3rd Qu.:15.000 NULL : 117 7 : 81892
## Max. :23.000 Max. :23.000 6 : 73864
## (Other):137426
## experience_n_entries_class experience_years_experience
## 1 : 93998 4 :388016
## 2 :284383 5 :288591
## 3 :859374 6 :199223
## NULL: 0 7 :170100
## 3 :106261
## 2 : 69118
## (Other): 16446
## experience_years_in_current edu_degree edu_fieldofstudies
## Min. :2.000 Desconhecido:499645 Min. :1.0
## 1st Qu.:2.000 Mestre :334619 1st Qu.:3.0
## Median :3.000 Graduado :146416 Median :6.0
## Mean :2.593 8 : 98710 Mean :5.8
## 3rd Qu.:3.000 Doutor : 66303 3rd Qu.:8.0
## Max. :3.000 7 : 34803 Max. :9.0
## (Other) : 57259 NA's :781187
# Per-column summary of the rows selected into dez_anos.
summary(dez_anos)
## id jobroles career_level
## Min. : 23 Min. : 0 desconhecido :55002
## 1st Qu.: 746562 1st Qu.: 736866 Professional :39144
## Median :1502550 Median :1851239 Senior Executive:37854
## Mean :1494151 Mean :2062349 Manager :22926
## 3rd Qu.:2246098 3rd Qu.:3378061 Executive :14531
## Max. :2984882 Max. :4583829 NULL :11370
## (Other) : 462
## discipline_id industry_id country region
## Min. : 0.000 Min. : 0.000 at : 11536 0 :97706
## 1st Qu.: 0.000 1st Qu.: 0.000 ch : 10417 2 :16624
## Median : 0.000 Median : 3.000 de :154488 9 :15309
## Mean : 1.009 Mean : 7.448 non_dach: 4737 1 :14752
## 3rd Qu.: 0.000 3rd Qu.:15.000 NULL : 111 7 :10884
## Max. :23.000 Max. :23.000 10 : 9661
## (Other):16353
## experience_n_entries_class experience_years_experience
## 1 :53929 7 :116647
## 2 :38894 6 : 57016
## 3 :80840 NULL : 7626
## NULL: 7626 0 : 0
## 1 : 0
## 2 : 0
## (Other): 0
## experience_years_in_current edu_degree edu_fieldofstudies
## Min. :7.000 Desconhecido:112957 Min. :1.00
## 1st Qu.:7.000 Mestre : 32603 1st Qu.:3.00
## Median :7.000 Doutor : 9129 Median :5.00
## Mean :7.486 8 : 8153 Mean :5.27
## 3rd Qu.:8.000 7 : 5639 3rd Qu.:8.00
## Max. :9.000 5 : 4066 Max. :9.00
## (Other) : 8742 NA's :134320
1 - Profissionais que estão há mais de 10 anos no mesmo emprego têm, em geral, um nível de carreira superior ao dos que passam até 3 anos na mesma empresa.
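2 - Entre os profissionais que estão há mais de 10 anos na mesma empresa, nenhum graduado declarado aparece entre as categorias de formação mais frequentes (o resumo agrupa as demais categorias em "(Other)").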
# Load the item table and count items per (latitude, longitude) pair, most
# common locations first.
newitems <- read.csv(file = "~/Documents/newitems.txt")
location <- newitems %>%
  group_by(latitude, longitude) %>%
  summarise(count = n())
location <- location[order(-location$count), ]
head(location)
## Source: local data frame [6 x 3]
## Groups: latitude [6]
##
## latitude longitude count
## (fctr) (fctr) (int)
## 1 NULL NULL 210025
## 2 52.5 13.4 149102
## 3 48.1 11.6 118351
## 4 53.6 10.0 96095
## 5 47.4 8.6 70935
## 6 50.1 8.7 65233
# Number of items per country together with each country's share of the total.
ct <- newitems %>%
  group_by(country) %>%
  summarise(count = n())
ct <- transform(ct, ratio = count / sum(count))
ct
## country count ratio
## 1 at 242494 0.08568953
## 2 ch 279023 0.09859770
## 3 de 2155822 0.76179771
## 4 non_dach 152575 0.05391507
# Frequency of each industry_id among the items.
industry_factor <- as.factor(newitems$industry_id)
summary(industry_factor)
## 0 1 2 3 4 5 6 7 8
## 405 125901 34849 28403 13309 35371 13564 10714 111941
## 9 10 11 12 13 14 15 16 17
## 77164 37588 549167 29233 16061 54345 1290720 67509 12743
## 18 19 20 21 22 23
## 31230 23125 11813 184964 39925 29870