# Load necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
# File paths ----
# The three MovieLens files live in one directory; keeping that directory in
# a single variable means the script can be pointed at another copy of the
# data by editing one line.
data_dir <- "C:/Users/Nao Imayuki/OneDrive/Desktop/機械学習/最終課題/ml-1m/ml-1m"
movies_file <- file.path(data_dir, "movies.dat")
ratings_file <- file.path(data_dir, "ratings.dat")
users_file <- file.path(data_dir, "users.dat")

#' Split "::"-delimited records into a character data frame
#'
#' @param lines Character vector with one record per element. Blank lines
#'   are skipped.
#' @param col_names Character vector of column names; its length is the
#'   number of fields every record must contain.
#' @return A data.frame with one character column per entry of `col_names`.
parse_dat_lines <- function(lines, col_names) {
  fields <- strsplit(lines, "::", fixed = TRUE)
  # strsplit() yields character(0) for an empty line; drop those records.
  fields <- fields[lengths(fields) > 0]

  # Fail loudly on malformed records instead of letting rbind() recycle
  # short rows into the wrong columns.
  bad <- which(lengths(fields) != length(col_names))
  if (length(bad) > 0) {
    stop(
      "Expected ", length(col_names), " fields per record but record ",
      bad[1], " has ", length(fields[[bad[1]]]), ".",
      call. = FALSE
    )
  }

  # Filling a matrix row by row avoids calling rbind() with one argument
  # per record. as.character() keeps the empty-input case well-typed.
  parsed <- matrix(
    as.character(unlist(fields, use.names = FALSE)),
    ncol = length(col_names),
    byrow = TRUE
  )
  parsed <- as.data.frame(parsed, stringsAsFactors = FALSE)
  colnames(parsed) <- col_names
  parsed
}

# Read the data ----
# The files are read line by line and split manually on the "::" separator.
movies_raw <- readLines(movies_file, encoding = "latin1")
ratings_raw <- readLines(ratings_file, encoding = "latin1")
users_raw <- readLines(users_file, encoding = "latin1")

movies <- parse_dat_lines(movies_raw, c("MovieID", "Title", "Genres"))
ratings <- parse_dat_lines(
  ratings_raw,
  c("UserID", "MovieID", "Rating", "Timestamp")
)
users <- parse_dat_lines(
  users_raw,
  c("UserID", "Gender", "Age", "Occupation", "Zip-code")
)

# Convert numeric columns ----
# Every parsed column is character; convert the ones used numerically below.
ratings$Rating <- as.numeric(ratings$Rating)
ratings$MovieID <- as.numeric(ratings$MovieID)
ratings$UserID <- as.numeric(ratings$UserID)
users$Age <- as.numeric(users$Age)
users$UserID <- as.numeric(users$UserID)
# Preview the first six rows of each table (console output kept below)
head(movies, n = 6L)
## MovieID Title Genres
## 1 1 Toy Story (1995) Animation|Children's|Comedy
## 2 2 Jumanji (1995) Adventure|Children's|Fantasy
## 3 3 Grumpier Old Men (1995) Comedy|Romance
## 4 4 Waiting to Exhale (1995) Comedy|Drama
## 5 5 Father of the Bride Part II (1995) Comedy
## 6 6 Heat (1995) Action|Crime|Thriller
head(ratings, n = 6L)
## UserID MovieID Rating Timestamp
## 1 1 1193 5 978300760
## 2 1 661 3 978302109
## 3 1 914 3 978301968
## 4 1 3408 4 978300275
## 5 1 2355 5 978824291
## 6 1 1197 3 978302268
head(users, n = 6L)
## UserID Gender Age Occupation Zip-code
## 1 1 F 1 10 48067
## 2 2 M 56 16 70072
## 3 3 M 25 15 55117
## 4 4 M 45 7 02460
## 5 5 M 25 20 55455
## 6 6 F 50 9 55117
# Give the join key the same (numeric) type on both sides of the join
movies[["MovieID"]] <- as.numeric(movies[["MovieID"]])
ratings[["MovieID"]] <- as.numeric(ratings[["MovieID"]])

# Mean rating and number of ratings per movie, ranked by mean rating with
# ties broken by rating count; keep the first ten and attach title/genres.
# NOTE(review): no minimum rating count is applied, so a movie with very
# few ratings can rank first -- confirm this is intended.
top_movies <- ratings %>%
  group_by(MovieID) %>%
  summarise(AverageRating = mean(Rating), Count = n()) %>%
  arrange(desc(AverageRating), desc(Count)) %>%
  slice_head(n = 10) %>%
  left_join(movies, by = "MovieID")

# Horizontal bar chart of the ten movies, ordered by average rating
ggplot(
  data = top_movies,
  mapping = aes(x = reorder(Title, AverageRating), y = AverageRating)
) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(title = "Top-Rated Movies", x = "Movie Title", y = "Average Rating")

# Write the most recent plot to disk (ggsave() defaults to the last plot)
ggsave(filename = "Top_Rated_Movies.jpg")
## Saving 7 x 5 in image
# Top-Rated and Most-Rated Movies ----
# Keep the join key numeric on both tables so the join below matches
movies[["MovieID"]] <- as.numeric(movies[["MovieID"]])
ratings[["MovieID"]] <- as.numeric(ratings[["MovieID"]])

# One row per rated movie: mean rating, number of ratings, title and genres
movie_stats <- ratings %>%
  group_by(MovieID) %>%
  summarise(AverageRating = mean(Rating), RatingCount = n()) %>%
  left_join(movies, by = "MovieID")

# Ten movies with the highest average rating
top_rated <- movie_stats %>%
  arrange(desc(AverageRating)) %>%
  slice_head(n = 10)

ggplot(
  data = top_rated,
  mapping = aes(x = reorder(Title, AverageRating), y = AverageRating)
) +
  geom_col(fill = "lightgreen") +
  coord_flip() +
  labs(title = "Top 10 Rated Movies", x = "Movie Title", y = "Average Rating")

# Ten movies with the largest number of ratings
most_rated <- movie_stats %>%
  arrange(desc(RatingCount)) %>%
  slice_head(n = 10)

ggplot(
  data = most_rated,
  mapping = aes(x = reorder(Title, RatingCount), y = RatingCount)
) +
  geom_col(fill = "lightblue") +
  coord_flip() +
  labs(title = "Top 10 Most Rated Movies", x = "Movie Title", y = "Rating Count")

# Clustering (クラスタリング) ----
# Create user-item rating matrix: one row per user, one column per movie,
# with 0 standing in for "not rated".
# pivot_wider() replaces the superseded spread(). names_sort and the
# explicit arrange() fix the column and row order, so the matrix handed to
# kmeans() does not depend on the order of rows in `ratings`.
user_item_matrix <- ratings %>%
  select(UserID, MovieID, Rating) %>%
  pivot_wider(
    names_from = MovieID,
    values_from = Rating,
    values_fill = 0,
    names_sort = TRUE
  ) %>%
  arrange(UserID)

# Perform k-means clustering on the rating columns (UserID column dropped)
set.seed(42) # fixed seed so the random initial centres are reproducible
clusters <- kmeans(user_item_matrix[, -1], centers = 3) # 3 clusters

# Add cluster information: pair each user with the cluster it was assigned
user_clusters <- data.frame(
  UserID = user_item_matrix$UserID,
  Cluster = clusters$cluster
)

# Visualize clusters: one point per user, coloured by assigned cluster
ggplot(user_clusters, aes(x = UserID, y = Cluster, color = as.factor(Cluster))) +
  geom_point() +
  labs(title = "User Clustering", x = "User ID", y = "Cluster Group")

# Time Series of Ratings (時系列分析) ----
# Timestamps were parsed as text; make them numeric, then turn them into
# date-times counted in seconds from 1970-01-01.
# NOTE(review): no time zone is passed here -- confirm the intended zone.
ratings[["Timestamp"]] <- as.numeric(ratings[["Timestamp"]])
ratings[["Date"]] <- as.POSIXct(ratings[["Timestamp"]], origin = "1970-01-01")

# Daily summary: mean rating and number of ratings per calendar date
time_series <- ratings %>%
  mutate(Date = as.Date(Date)) %>%
  group_by(Date) %>%
  summarise(AverageRating = mean(Rating), TotalRatings = n())

# Line chart of the daily mean rating
ggplot(data = time_series, mapping = aes(x = Date, y = AverageRating)) +
  geom_line(color = "blue") +
  labs(title = "Average Rating Over Time", x = "Date", y = "Average Rating")

# Line chart of the daily number of ratings
ggplot(data = time_series, mapping = aes(x = Date, y = TotalRatings)) +
  geom_line(color = "red") +
  labs(title = "Total Ratings Over Time", x = "Date", y = "Number of Ratings")

# Predictive Accuracy Metrics (予測精度評価指標) ----
# Split data into training (80%) and test (20%) sets
set.seed(42) # fixed seed so the split is reproducible
n_ratings <- nrow(ratings)
# seq_len() is safe when there are no rows (1:0 would yield c(1, 0)), and
# floor() makes the truncation of the fractional sample size explicit.
train_index <- sample(seq_len(n_ratings), floor(0.8 * n_ratings))
# Select the test rows with a logical mask: negative indexing with an empty
# train_index would return zero rows instead of all of them.
in_train <- seq_len(n_ratings) %in% train_index
train <- ratings[train_index, ]
test <- ratings[!in_train, ]

# Predict using average rating (simple baseline model): every test rating
# is predicted as the mean rating of the training set.
avg_rating <- mean(train$Rating)
test$Prediction <- avg_rating

# Calculate RMSE of the constant prediction on the held-out ratings
rmse <- sqrt(mean((test$Rating - test$Prediction)^2))
cat("RMSE of baseline model:", rmse, "\n")
## RMSE of baseline model: 1.119537
# Genre Analysis ----
# Expand genres into individual rows: a movie tagged "A|B" becomes one row
# for genre A and one for genre B, so each of its ratings is counted once
# per genre.
# After the expansion a MovieID appears several times in `movies`, and it
# already appears several times in `ratings`, so the join is many-to-many
# by design. Declaring the relationship documents that intent and stops
# dplyr from warning about an "unexpected" many-to-many match.
# Only MovieID and Rating are needed from `ratings` for the summary.
genre_expanded <- movies %>%
  separate_rows(Genres, sep = "\\|") %>%
  inner_join(
    select(ratings, MovieID, Rating),
    by = "MovieID",
    relationship = "many-to-many"
  ) %>%
  group_by(Genres) %>%
  summarise(AverageRating = mean(Rating), RatingCount = n()) %>%
  arrange(desc(RatingCount))

# Plot genre popularity: horizontal bars ordered by number of ratings
ggplot(genre_expanded, aes(x = reorder(Genres, RatingCount), y = RatingCount)) +
  geom_bar(stat = "identity", fill = "purple") +
  coord_flip() +
  labs(title = "Genre Popularity", x = "Genre", y = "Rating Count")
