# Load necessary libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)

# Locations of the three "::"-delimited MovieLens data files
movies_file <- "C:/Users/Nao Imayuki/OneDrive/Desktop/機械学習/最終課題/ml-1m/ml-1m/movies.dat"
ratings_file <- "C:/Users/Nao Imayuki/OneDrive/Desktop/機械学習/最終課題/ml-1m/ml-1m/ratings.dat"
users_file <- "C:/Users/Nao Imayuki/OneDrive/Desktop/機械学習/最終課題/ml-1m/ml-1m/users.dat"

# Pull each file into memory as a character vector, one element per line.
# The separator is two characters wide, so the lines are split by hand below.
movies_raw <- readLines(con = movies_file, encoding = "latin1")
ratings_raw <- readLines(con = ratings_file, encoding = "latin1")
users_raw <- readLines(con = users_file, encoding = "latin1")

# Split "::"-delimited lines into fields, stack them row-wise and return an
# all-character data frame whose columns carry the supplied names.
parse_records <- function(lines, col_names) {
  fields <- strsplit(lines, split = "::", fixed = TRUE)
  setNames(
    as.data.frame(do.call(rbind, fields), stringsAsFactors = FALSE),
    col_names
  )
}

movies <- parse_records(movies_raw, c("MovieID", "Title", "Genres"))
ratings <- parse_records(
  ratings_raw,
  c("UserID", "MovieID", "Rating", "Timestamp")
)
users <- parse_records(
  users_raw,
  c("UserID", "Gender", "Age", "Occupation", "Zip-code")
)

# Every field arrives as text; coerce the columns used numerically later on.
# Timestamp, Occupation and Zip-code are intentionally left as character here.
rating_numeric_cols <- c("Rating", "MovieID", "UserID")
ratings[rating_numeric_cols] <- lapply(ratings[rating_numeric_cols], as.numeric)

user_numeric_cols <- c("Age", "UserID")
users[user_numeric_cols] <- lapply(users[user_numeric_cols], as.numeric)

# Preview each table: head() with its default row count prints the first six
# rows. The `##` lines are the console output captured when the script was run.
# Zip-code is still character at this point, so leading zeros (e.g. 02460)
# are preserved; Timestamp has not been converted to numeric yet.
head(movies)
##   MovieID                              Title                       Genres
## 1       1                   Toy Story (1995)  Animation|Children's|Comedy
## 2       2                     Jumanji (1995) Adventure|Children's|Fantasy
## 3       3            Grumpier Old Men (1995)               Comedy|Romance
## 4       4           Waiting to Exhale (1995)                 Comedy|Drama
## 5       5 Father of the Bride Part II (1995)                       Comedy
## 6       6                        Heat (1995)        Action|Crime|Thriller
head(ratings)
##   UserID MovieID Rating Timestamp
## 1      1    1193      5 978300760
## 2      1     661      3 978302109
## 3      1     914      3 978301968
## 4      1    3408      4 978300275
## 5      1    2355      5 978824291
## 6      1    1197      3 978302268
head(users)
##   UserID Gender Age Occupation Zip-code
## 1      1      F   1         10    48067
## 2      2      M  56         16    70072
## 3      3      M  25         15    55117
## 4      4      M  45          7    02460
## 5      5      M  25         20    55455
## 6      6      F  50          9    55117
# The join key must have the same type on both sides: movies$MovieID is still
# character after parsing, so coerce it (and, harmlessly, the ratings key too).
movies[["MovieID"]] <- as.numeric(movies[["MovieID"]])
ratings[["MovieID"]] <- as.numeric(ratings[["MovieID"]])

# Per-movie mean rating and number of ratings
top_movies <- ratings %>%
  group_by(MovieID) %>%
  summarise(
    AverageRating = mean(Rating),
    Count = n()
  )

# Keep the ten best averages (ties broken by rating count), then attach the
# title and genres from the movies table.
top_movies <- top_movies %>%
  arrange(desc(AverageRating), desc(Count)) %>%
  slice_head(n = 10) %>%
  left_join(movies, by = "MovieID")

# Horizontal bar chart, bars ordered by average rating
ggplot(
  data = top_movies,
  mapping = aes(x = reorder(Title, AverageRating), y = AverageRating)
) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(title = "Top-Rated Movies", x = "Movie Title", y = "Average Rating")

# Write the most recently created plot to disk (default device size)
ggsave(filename = "Top_Rated_Movies.jpg")
## Saving 7 x 5 in image

# Top-Rated and Most-Rated Movies ----

# Make sure both MovieID key columns are numeric before joining
movies[["MovieID"]] <- as.numeric(movies[["MovieID"]])
ratings[["MovieID"]] <- as.numeric(ratings[["MovieID"]])

# One row per movie: mean rating, number of ratings, plus title and genres
movie_stats <- ratings %>%
  group_by(MovieID) %>%
  summarise(
    AverageRating = mean(Rating),
    RatingCount = n()
  ) %>%
  left_join(movies, by = "MovieID")

# Ten movies with the highest mean rating
top_rated <- movie_stats %>%
  arrange(desc(AverageRating)) %>%
  slice_head(n = 10)

ggplot(
  data = top_rated,
  mapping = aes(x = reorder(Title, AverageRating), y = AverageRating)
) +
  geom_col(fill = "lightgreen") +
  coord_flip() +
  labs(title = "Top 10 Rated Movies", x = "Movie Title", y = "Average Rating")

# Ten movies with the largest number of ratings
most_rated <- movie_stats %>%
  arrange(desc(RatingCount)) %>%
  slice_head(n = 10)

ggplot(
  data = most_rated,
  mapping = aes(x = reorder(Title, RatingCount), y = RatingCount)
) +
  geom_col(fill = "lightblue") +
  coord_flip() +
  labs(title = "Top 10 Most Rated Movies", x = "Movie Title", y = "Rating Count")

# Clustering (クラスタリング) ----

# Create user-item rating matrix: one row per user, one column per movie,
# with 0 standing in for every movie the user has not rated.
# pivot_wider() replaces the superseded spread(). names_sort = TRUE and the
# final arrange() reproduce spread()'s sorted column and row order, so the
# seeded k-means run below sees the same input as before.
user_item_matrix <- ratings %>%
  select(UserID, MovieID, Rating) %>%
  pivot_wider(
    names_from = MovieID,
    values_from = Rating,
    values_fill = 0,
    names_sort = TRUE
  ) %>%
  arrange(UserID)

# Perform k-means clustering on the rating columns only (first column is
# UserID and must not take part in the distance computation).
set.seed(42)
clusters <- kmeans(user_item_matrix[, -1], centers = 3)  # 3 clusters

# Add cluster information: one row per user with its assigned cluster label
user_clusters <- data.frame(
  UserID = user_item_matrix$UserID,
  Cluster = clusters$cluster
)

# Visualize clusters: each point is a user, placed at its cluster number
ggplot(user_clusters, aes(x = UserID, y = Cluster, color = as.factor(Cluster))) +
  geom_point() +
  labs(title = "User Clustering", x = "User ID", y = "Cluster Group")

# Time Series of Ratings (時系列分析) ----

# Timestamp was parsed as text; make it numeric, then interpret it as seconds
# counted from 1970-01-01 to obtain a date-time column.
ratings[["Timestamp"]] <- as.numeric(ratings[["Timestamp"]])
ratings[["Date"]] <- as.POSIXct(ratings[["Timestamp"]], origin = "1970-01-01")

# Daily aggregates: mean rating and number of ratings per calendar date
time_series <- ratings %>%
  mutate(Date = as.Date(Date)) %>%
  group_by(Date) %>%
  summarise(
    AverageRating = mean(Rating),
    TotalRatings = n()
  )

# Daily mean rating
ggplot(data = time_series, mapping = aes(x = Date, y = AverageRating)) +
  geom_line(colour = "blue") +
  labs(title = "Average Rating Over Time", x = "Date", y = "Average Rating")

# Daily rating volume
ggplot(data = time_series, mapping = aes(x = Date, y = TotalRatings)) +
  geom_line(colour = "red") +
  labs(title = "Total Ratings Over Time", x = "Date", y = "Number of Ratings")

# Predictive Accuracy Metrics (予測精度評価指標) ----

# Split data into training (80%) and test (20%) sets.
# sample.int() with an explicit floor() draws the same indices as the former
# sample(1:nrow(ratings), 0.8 * nrow(ratings)) but is also safe for zero rows
# and does not rely on the sample size being silently truncated.
set.seed(42)
n_ratings <- nrow(ratings)
train_index <- sample.int(n_ratings, size = floor(0.8 * n_ratings))
train <- ratings[train_index, ]
# Select the complement by positive index: ratings[-train_index, ] would
# return zero rows (instead of every row) whenever train_index is empty.
test <- ratings[setdiff(seq_len(n_ratings), train_index), ]

# Predict using average rating (simple baseline model): every test rating is
# predicted as the mean rating of the training set.
avg_rating <- mean(train$Rating)
test$Prediction <- avg_rating

# Calculate RMSE of the constant prediction on the held-out ratings
rmse <- sqrt(mean((test$Rating - test$Prediction)^2))
cat("RMSE of baseline model:", rmse, "\n")
## RMSE of baseline model: 1.119537

# Genre Analysis ----

# Expand genres into individual rows (one row per movie/genre pair), attach
# every rating of the movie to each of its genres, then summarise per genre.
# A movie has several genres and several ratings, so the join is many-to-many
# by design; declaring the relationship silences dplyr's fan-out warning.
genre_expanded <- movies %>%
  separate_rows(Genres, sep = "\\|") %>%
  inner_join(ratings, by = "MovieID", relationship = "many-to-many") %>%
  group_by(Genres) %>%
  summarise(AverageRating = mean(Rating), RatingCount = n()) %>%
  arrange(desc(RatingCount))

# Plot genre popularity: horizontal bars ordered by number of ratings.
# A rating of a multi-genre movie is counted once under each of its genres.
ggplot(genre_expanded, aes(x = reorder(Genres, RatingCount), y = RatingCount)) +
  geom_col(fill = "purple") +
  coord_flip() +
  labs(title = "Genre Popularity", x = "Genre", y = "Rating Count")