Проект StackExchange, неделя 3

Цели текущей недели:

Начиная с этой недели, задание на неделю устанавливает минимальный уровень требований. Ваша задача — изучить и раскрыть особенности своего проекта, задавая содержательные вопросы и используя для получения ответов на них весь спектр доступных вам средств R и дополнительное чтение.

library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the badge log. The file is read without a header row, so the column
# names are assigned explicitly right after reading.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
badges <- read.csv("~/math_badges.csv", header = FALSE)

names(badges) <- c("transID", "userID", "badge", "date")

# User ids and badge names are treated as categorical labels.
badges[["userID"]] <- as.factor(badges[["userID"]])
badges[["badge"]] <- as.factor(badges[["badge"]])

# Parse the timestamps (no tz is passed, so the session time zone applies),
# then bucket every record into a month-wide interval.
badges[["date"]] <- as.POSIXct(badges[["date"]])
badges[["month"]] <- cut(badges[["date"]], "1 month")


# Count how many times each badge was awarded: one summary row per badge.
by_badge <- dplyr::group_by(badges, badge)
badge.freq <- dplyr::summarise(by_badge, user_per_badge = n())

# Ascending order, i.e. the least-awarded badges come first.
bf.sorted <- dplyr::arrange(badge.freq, user_per_badge)

# Number of distinct users present in the badge log.
user.count <- length(unique(badges[["userID"]]))

# A badge counts as "rare" when its number of awards does not exceed 1% of
# the number of distinct users.
rare_badges <- dplyr::filter(bf.sorted, user_per_badge <= user.count / 100)


# Per-badge award counts, most-awarded badges first.
bf.sorted.desc <- dplyr::arrange(badge.freq, desc(user_per_badge))

# The ten most frequently awarded badges. head() simply returns fewer entries
# when there are fewer than ten badge types, whereas indexing with [1:10]
# would pad the result with NA.
most_pop_badges <- head(bf.sorted.desc$badge, 10)

# Number of distinct users holding at least one of the most popular badges.
users_with_pop_badges <- length(
  unique(
    dplyr::filter(badges, badge %in% most_pop_badges)$userID))

# Badge records that remain after the most popular badges are removed.
badges.interesting <- dplyr::filter(badges, !(badge %in% most_pop_badges))
# Number of badge records per user.
# NOTE(review): this groups the full `badges` table, not `badges.interesting`;
# confirm that including the most popular badges here is intended.
by_uid <- dplyr::group_by(badges, userID)
user.badge.freq <- dplyr::summarise(by_uid, badge_per_user = n())

# Users ordered from most to fewest badge records; the first row is the user
# with the largest number of badges.
ub.sorted.desc <- dplyr::arrange(user.badge.freq, desc(badge_per_user))
badgest_user <- ub.sorted.desc$userID[1]

# Every badge record that belongs to that user.
badges_of_user <- dplyr::filter(badges, userID == badgest_user)


# library() stops with an error when ggplot2 is not installed; require() would
# only return FALSE and let the plotting calls further down fail instead.
library(ggplot2)

# Histogram of the award dates of the top user's badges; bar height is the
# number of badge records falling into each time bin.
ggplot(data = badges_of_user, mapping = aes(x = date, y = ..count..)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-2


# Bar chart of how often each badge was awarded. geom_bar() counts the rows
# per badge by default, so no y mapping is needed. The former `stat=identity`
# argument to ggplot() is dropped: a stat is a property of a layer, and
# ggplot() does nothing with it.
ggplot(badges, aes(x = badge)) + geom_bar()
## Warning: position_stack requires constant width: output may be incorrect

plot of chunk unnamed-chunk-2


# Bar chart of the precomputed per-badge counts, badges ordered from most to
# least awarded. y is mapped to a column that already holds the bar heights,
# so the stat must be "identity" and must be set on the layer. Passed to
# ggplot() it had no effect: geom_bar() fell back to its counting stat and
# ggplot2 reported "Mapping a variable to y and also using stat='bin'".
ggplot(bf.sorted.desc,
       aes(x = reorder(badge, -user_per_badge), y = user_per_badge)) +
  geom_bar(stat = "identity")
## Warning: position_stack requires constant width: output may be incorrect

plot of chunk unnamed-chunk-2



# Split the badges into four groups by award count, in two different ways.

# 1) Four equal-width intervals over the range of the counts.
bf.sorted.desc[["group_break"]] <- cut(
  bf.sorted.desc[["user_per_badge"]],
  breaks = 4,
  labels = c("cg1", "cg2", "cg3", "cg4")
)

# 2) k-means with four clusters; the cluster index of each badge is stored.
# NOTE(review): no seed is set before kmeans(), so the cluster numbering may
# differ between runs.
bf.sorted.desc[["kmeans_break"]] <- kmeans(
  bf.sorted.desc[["user_per_badge"]],
  centers = 4
)[["cluster"]]


# `badges_of_user.groups` was never created, so this plot used to stop with
# "object 'badges_of_user.groups' not found". Build it by attaching each
# badge's frequency group to the top user's badge records.
# NOTE(review): presumably this join is the table the plot was meant to use;
# confirm against the intended analysis.
badges_of_user.groups <- dplyr::left_join(
  badges_of_user,
  bf.sorted.desc[, c("badge", "group_break")],
  by = "badge"
)

# Density of the award dates, one filled curve per badge-frequency group.
ggplot(badges_of_user.groups, aes(date, fill = group_break)) + geom_density()