Цели текущей недели:
Начиная с этой недели, недельное задание задаёт лишь минимальный уровень требований. Ваша задача — изучить и раскрыть особенности своего проекта: задавайте содержательные вопросы и ищите на них ответы, используя весь спектр доступных вам средств R и дополнительную литературу.
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the badge log from a header-less CSV and name its four columns.
badges <- read.csv("~/math_badges.csv", header = FALSE)
names(badges) <- c("transID", "userID", "badge", "date")
# User ids and badge names are treated as categorical labels.
badges[["userID"]] <- as.factor(badges[["userID"]])
badges[["badge"]] <- as.factor(badges[["badge"]])
# NOTE(review): no `tz` is supplied, so the timestamps are interpreted in the
# session time zone -- confirm this matches how the file was exported.
badges[["date"]] <- as.POSIXct(badges[["date"]])
# Bucket every award into the calendar month it falls in.
badges[["month"]] <- cut(badges[["date"]], "1 month")
# Number of award rows per badge, sorted ascending.
# NOTE(review): `user_per_badge` counts rows, not distinct users; if a user can
# receive the same badge more than once the two differ -- confirm which one is
# intended before comparing against `user.count` below.
by_badge <- dplyr::group_by(badges, badge)
badge.freq <- dplyr::summarise(by_badge, user_per_badge = length(transID))
bf.sorted <- dplyr::arrange(badge.freq, user_per_badge)

# Badges whose award count is at most 1% of the number of distinct users.
user.count <- length(unique(badges$userID))
rare_badges <- dplyr::filter(bf.sorted, user_per_badge <= user.count / 100)

# Ten most frequently awarded badges. head() returns fewer entries when fewer
# than ten badges exist, whereas `[1:10]` would pad the result with NA.
bf.sorted.desc <- dplyr::arrange(badge.freq, dplyr::desc(user_per_badge))
most_pop_badges <- head(bf.sorted.desc$badge, 10)

# Number of distinct users holding at least one of the most popular badges.
users_with_pop_badges <- length(unique(
  dplyr::filter(badges, badge %in% most_pop_badges)$userID
))

# Award rows for every badge outside the most popular set.
badges.interesting <- dplyr::filter(badges, !(badge %in% most_pop_badges))
# Count award rows per user, rank users from most to fewest awards, and keep
# the rows that belong to the top-ranked user.
by_uid <- badges %>% group_by(userID)
user.badge.freq <- by_uid %>%
  dplyr::summarise(badge_per_user = length(transID))
ub.sorted.desc <- user.badge.freq %>%
  dplyr::arrange(desc(badge_per_user))
# First entry after the descending sort is the user with the most awards.
badgest_user <- ub.sorted.desc$userID[1]
badges_of_user <- badges %>%
  dplyr::filter(userID == badgest_user)
# library() stops with an error when the package is missing, whereas require()
# only returns FALSE and lets the script fail later at the first ggplot() call.
library(ggplot2)
# Histogram of award timestamps for the user with the most badges.
ggplot(badges_of_user, aes(date, ..count..)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Award rows per badge; geom_bar() counts the rows itself, so no `stat`
# argument is needed (and ggplot() does not forward one to the layer).
ggplot(badges, aes(x = badge, ..count..)) + geom_bar()
## Warning: position_stack requires constant width: output may be incorrect
# Badges ordered from most to least awarded. `user_per_badge` is already
# aggregated, so the bar layer must use stat = "identity". The original passed
# `stat` to ggplot(), which does not reach the layer, so geom_bar() fell back
# to its counting stat while `y` was mapped.
ggplot(
  bf.sorted.desc,
  aes(x = reorder(badge, -user_per_badge), y = user_per_badge)
) +
  geom_bar(stat = "identity")
## Warning: position_stack requires constant width: output may be incorrect
# Split badges into four equal-width intervals of award count; labels follow
# ascending interval order, so "cg1" covers the lowest counts.
bf.sorted.desc$group_break <- cut(
  bf.sorted.desc$user_per_badge,
  breaks = 4,
  labels = c("cg1", "cg2", "cg3", "cg4")
)
# kmeans() picks its starting centres at random; fixing the seed makes the
# cluster assignment reproducible between runs. Cluster ids are arbitrary
# labels and carry no ordering.
set.seed(42)
bf.sorted.desc$kmeans_break <- kmeans(
  bf.sorted.desc$user_per_badge,
  centers = 4
)$cluster
# The plot below previously failed with "object 'badges_of_user.groups' not
# found" because the data frame was never created. Build it by attaching each
# badge's group columns to the top user's award rows.
# NOTE(review): the join on `badge` is inferred from the variable name --
# confirm this is the intended data set.
badges_of_user.groups <- dplyr::inner_join(
  badges_of_user,
  bf.sorted.desc,
  by = "badge"
)
# Density of the top user's awards over time, split by badge popularity group.
ggplot(badges_of_user.groups, aes(date, fill = group_break)) + geom_density()