# Import the datasets into the Global Environment
users <- read.csv("streaming_users.csv")
activity <- read.csv("streaming_activity.csv")
head(users)
head(activity)
dim(users)
## [1] 180 5
dim(activity)
## [1] 180 4
names(users)
## [1] "user_id" "age" "gender"
## [4] "subscription_type" "region"
names(activity)
## [1] "user_id" "hours_watched" "favorite_genre" "rating_given"
str(users)
## 'data.frame': 180 obs. of 5 variables:
## $ user_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num 44 49 59 26 35 58 44 42 55 26 ...
## $ gender : chr "Female" "Female" "Male" "Female" ...
## $ subscription_type: chr "Premium" "Premium" "Premium" "Standard" ...
## $ region : chr "South" "West" "East" "North" ...
str(activity)
## 'data.frame': 180 obs. of 4 variables:
## $ user_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ hours_watched : int 20 51 31 10 31 33 42 21 23 30 ...
## $ favorite_genre: chr "Documentary" "Action" "Documentary" "Documentary" ...
## $ rating_given : int 5 4 5 -99 2 5 3 1 4 4 ...
users$subscription_type <- as.factor(users$subscription_type)
users$gender <- as.factor(users$gender)
activity$favorite_genre <- as.factor(activity$favorite_genre)
There are 14 missing values in age. ## Q9 The mean is activity rating is 3.4221
sum(is.na(users$age))
## [1] 14
activity$rating_given[activity$rating_given == -99] <- NA
mean(activity$rating_given, na.rm = TRUE)
## [1] 3.422078
The data structure is 180 by 8
data <- merge(users, activity, by = "user_id")
dim(data)
## [1] 180 8
str(data)
## 'data.frame': 180 obs. of 8 variables:
## $ user_id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ age : num 44 49 59 26 35 58 44 42 55 26 ...
## $ gender : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 2 1 1 2 2 ...
## $ subscription_type: Factor w/ 3 levels "Basic","Premium",..: 2 2 2 3 2 2 2 2 3 3 ...
## $ region : chr "South" "West" "East" "North" ...
## $ hours_watched : int 20 51 31 10 31 33 42 21 23 30 ...
## $ favorite_genre : Factor w/ 4 levels "Action","Comedy",..: 3 1 3 3 1 3 4 4 1 4 ...
## $ rating_given : int 5 4 5 NA 2 5 3 1 4 4 ...
14 missing values ## Q16 154 rows after cleaning ## Q18 Switching NAs with 0 adds no value to the sum while maintaining the total row count. This lowers the average rating down Dirty data results in 3.422 while clean is 2.927, a difference of 0.495. A 14.46% deduction to the average rating.
sum(is.na(data$age))
## [1] 14
dataMean <- mean(data$age, na.rm = TRUE)
data$age[is.na(data$age)] <- dataMean
dataMean
## [1] 38.20482
sum(is.na(data$age))
## [1] 0
data_clean <- data[!is.na(data$rating_given), ]
nrow(data_clean)
## [1] 154
data$rating_clean <- ifelse(is.na(data$rating_given), 0, data$rating_given)
mean(data$rating_given, na.rm=TRUE)
## [1] 3.422078
mean(data$rating_clean)
## [1] 2.927778
head() ## Q22 There are 69 users who have rating_given greater than or equal to 4 and hours_watched greater than 20.
prem_heavy <- data[data$hours_watched > 25 & data$subscription_type == "Premium", ]
prem_heavy_sorted <- prem_heavy[order(-prem_heavy$hours_watched), ]
prem_heavy_sorted
head(prem_heavy_sorted, 5)
nrow(data[data$rating_given >= 4 & data$hours_watched > 20, ])
## [1] 69
There are more than half at 0.5873 with a high_rating among heavy users
data$heavy_user <- ifelse (data$hours_watched > 30, "Yes", "No")
data$high_rating <- data$rating_given >= 4
mean(data$high_rating[data$heavy_user == "Yes"], na.rm = TRUE)
## [1] 0.5873016
East has 50, North 41, South 42, and West 47
They all average 25 except for the south which is 28. Intresting that it isn’t in the top heavy users. Maybe there is less premium use but more people in the south as opposed to other areas.
Premium has the highest average at over 30.4 hours.
aggregate(cbind(hours_watched, rating_given) ~ subscription_type, data = data, FUN = mean, na.rm = TRUE)
table(data$region)
##
## East North South West
## 50 41 42 47
aggregate(hours_watched ~ region, data = data, FUN = mean)
# Q29
agg_hrs <- aggregate(hours_watched ~ subscription_type, data = data, FUN = mean)
agg_hrs[which.max(agg_hrs$hours_watched), ]
num_vars <- data[, c("age", "hours_watched", "rating_given")]
apply(num_vars, 2, mean, na.rm = TRUE)
## age hours_watched rating_given
## 38.204819 26.227778 3.422078
Most watched_hours user_id is 2
which(data$hours_watched > 40)
## [1] 2 7 27 33 36 43 52 62 65 76 78 93 94 97 113 120 122 134 144
## [20] 165 170 178
data$user_id[which.max(data$hours_watched)]
## [1] 2
Documentary is the missing axis label, should be shortened to Doc or Docu
Premium watches the most content
hist(data$hours_watched,
main = "Hours Watched",
xlab = "Hours")
boxplot(hours_watched ~ subscription_type,
data = data,
main = "Subscriber's Watch Hours",
ylab = "Hours")
barplot(table(data$favorite_genre),
main = "Favorite Genres",
ylab = "Frequency")
plot(data$hours_watched,
data$rating_given,
main="Hours vs Rating",
xlab="Hours",
ylab="Rating")
The most watched subscription type is the premium level.
There looks to be an equilibrium going diagonally across the scatter plot. More hours watched mean less reviews but also higher ratings. There are levels where this path is distinct.
Imputing missing ratings with exactly 0 treats non-responses as negative feedback rather than null data. This artificially deflates the true average rating given by active reviewers.
agg_hrs$subscription_type[which.max(agg_hrs$hours_watched)]
## [1] Premium
## Levels: Basic Premium Standard