library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.5.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.3
## corrplot 0.95 loaded
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.3
steam <- read.csv("C:/Users/HP/Downloads/steam_games_2026.csv")
str(steam)
## 'data.frame': 1000 obs. of 12 variables:
## $ AppID : int 730 2868840 3321460 3065800 3764200 2157830 1144200 1172470 1808500 2852190 ...
## $ Name : chr "Counter-Strike 2" "Slay the Spire 2" "Crimson Desert" "Marathon" ...
## $ Release_Date : chr "2012-08-21" "2026-03-05" "2026-03-19" "2026-03-05" ...
## $ Primary_Genre : chr "Action" "Indie" "Action" "Action" ...
## $ All_Tags : chr "FPS;Shooter;Multiplayer;Competitive;Action;Team-Based;eSports;Tactical;First-Person;PvP" "Strategy;Roguelike;Card Game;Deckbuilding;Co-op;Roguelike Deckbuilder;Card Battler;Turn-Based Combat;Multiplaye"| __truncated__ "Action;Open World;Singleplayer;Adventure;Combat;Exploration;Dragons;Story Rich;Action-Adventure;Third Person" "Extraction Shooter;PvP;Multiplayer;Sci-fi;Shooter;FPS;Futuristic;Action;First-Person;PvE" ...
## $ Price_USD : num 0 25 70 40 70 ...
## $ Discount_Pct : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Review_Score_Pct : int 83 97 0 90 96 83 85 65 79 75 ...
## $ Total_Reviews : int 4980365 49549 0 24360 79667 2765 243810 1577 264150 1570 ...
## $ Steam_Deck_Status: chr "Unknown" "Unknown" "Unknown" "Unknown" ...
## $ Estimated_Owners : int 149410950 1486470 0 730800 2390010 82950 7314300 47310 7924500 47100 ...
## $ X24h_Peak_Players: int 1013936 0 0 0 0 0 4296 124262 0 0 ...
Interpretation: This dataset of STEAM contains games data across 1000 records and 12 columns. It includes information such as game name, release date, genre, price, reviews and estimated owners.
colSums(is.na(steam))
## AppID Name Release_Date Primary_Genre
## 0 0 0 0
## All_Tags Price_USD Discount_Pct Review_Score_Pct
## 0 0 0 0
## Total_Reviews Steam_Deck_Status Estimated_Owners X24h_Peak_Players
## 0 0 0 0
Interpretation: The dataset has no missing values, ensuring complete data for analysis.
result1 <- steam %>%
arrange(desc(X24h_Peak_Players)) %>%
select(Name, Price_USD, X24h_Peak_Players)
result1[1:10, ]
## Name Price_USD X24h_Peak_Players
## 1 Counter-Strike 2 0.00 1013936
## 2 Dota 2 0.00 623941
## 3 PUBG: BATTLEGROUNDS 0.00 314682
## 4 ELDEN RING NIGHTREIGN 39.99 163599
## 5 Rust 39.99 143870
## 6 Apex Legendsâ„¢ 0.00 124262
## 7 Marvel Rivals 0.00 106611
## 8 DELTARUNE 24.99 83936
## 9 Call of Duty® 0.00 67419
## 10 NARAKA: BLADEPOINT 0.00 66954
Interpretation: Top 10 games are counter strike, Dota 2, Pubg and more. Most of the games with high playtime are free.
result2 <- steam %>%
arrange(desc(Review_Score_Pct)) %>%
select(Name, Price_USD, Review_Score_Pct, Total_Reviews)
result2[1:10, ]
## Name Price_USD Review_Score_Pct
## 1 The Jackbox Party Pack 3 24.99 100
## 2 STAR WARSâ„¢ Empire at War - Gold Pack 19.99 100
## 3 Funi Raccoon Game 10.79 99
## 4 Creature Kitchen 7.99 99
## 5 Crab Champions 6.99 99
## 6 Sam & Max: Beyond Time and Space 9.99 99
## 7 Opus Magnum 19.99 99
## 8 Stardew Valley 14.99 98
## 9 DREDGE 9.99 98
## 10 Subnautica 29.99 98
## Total_Reviews
## 1 5065
## 2 34891
## 3 463
## 4 3126
## 5 25466
## 6 428
## 7 5304
## 8 846150
## 9 43759
## 10 308655
Interpretation: Highest rated games are mostly paid but total number of reviews are less compared to free games.
result3 <- steam %>%
mutate(freevspaid = ifelse(Price_USD == 0, "FREE", "PAID")) %>%
group_by(freevspaid) %>%
summarise(total_games = n())
result3
## # A tibble: 2 × 2
## freevspaid total_games
## <chr> <int>
## 1 FREE 108
## 2 PAID 892
Interpretation: There are 108 free and 892 paid games.
result4 <- steam %>%
filter(Price_USD > 0) %>%
group_by(Primary_Genre) %>%
summarise(total_sales = sum(Price_USD), total_Reviews = sum(Total_Reviews)) %>%
arrange(desc(total_Reviews))
result4
## # A tibble: 12 × 3
## Primary_Genre total_sales total_Reviews
## <chr> <dbl> <int>
## 1 Action 13343. 37090767
## 2 Indie 1237. 4234764
## 3 Adventure 1906. 3925693
## 4 RPG 1618. 3022916
## 5 Casual 986. 2716910
## 6 Simulation 1372. 1487554
## 7 Strategy 708. 1332337
## 8 Racing 671. 595802
## 9 Massively Multiplayer 102. 196181
## 10 Sports 20.0 12634
## 11 Early Access 30.0 11447
## 12 Unknown 0.49 0
Interpretation: Action games are most sold and are most popular among the other generes.
steam$Release_Date <- as.Date(steam$Release_Date)
steam$year = year(steam$Release_Date)
result5 <- steam %>%
group_by(year) %>%
summarise(total_games = n())
result5
## # A tibble: 22 × 2
## year total_games
## <dbl> <int>
## 1 2006 1
## 2 2007 2
## 3 2008 3
## 4 2009 6
## 5 2010 5
## 6 2011 15
## 7 2012 32
## 8 2013 17
## 9 2014 12
## 10 2015 24
## # ℹ 12 more rows
Interpretation: Most of the games were released in between 2021 to 2026.
steam %>%
filter(Estimated_Owners > 0) %>%
group_by(Primary_Genre) %>%
summarise(avg_owners = mean(Estimated_Owners))
## # A tibble: 11 × 2
## Primary_Genre avg_owners
## <chr> <dbl>
## 1 Action 2654427.
## 2 Adventure 1286517.
## 3 Casual 1482257.
## 4 Early Access 171705
## 5 Indie 2082723.
## 6 Massively Multiplayer 545468.
## 7 RPG 1374053.
## 8 Racing 1117129.
## 9 Simulation 992971.
## 10 Sports 379020
## 11 Strategy 1642166.
steam <- steam %>%
mutate(rating = case_when(
Review_Score_Pct <= 20 ~ "*",
Review_Score_Pct > 20 & Review_Score_Pct <=40 ~ "**",
Review_Score_Pct >40 & Review_Score_Pct <=60 ~ "***",
Review_Score_Pct >60 & Review_Score_Pct <=80 ~ "****",
Review_Score_Pct >80 ~ "*****",
))
steam <- steam %>%
mutate(category = case_when(
Price_USD == 0 ~ "FREE",
Price_USD >0 & Price_USD < 25 ~ "AFFORDABLE",
Price_USD >= 25 ~ "PREMIUIM"
))
steam %>%
select(Name, Price_USD, rating, category) %>%
head(10)
## Name Price_USD rating category
## 1 Counter-Strike 2 0.00 ***** FREE
## 2 Slay the Spire 2 24.99 ***** AFFORDABLE
## 3 Crimson Desert 69.99 * PREMIUIM
## 4 Marathon 39.99 ***** PREMIUIM
## 5 Resident Evil Requiem 69.99 ***** PREMIUIM
## 6 John Carpenter's Toxic Commando 39.99 ***** PREMIUIM
## 7 Ready or Not 49.99 ***** PREMIUIM
## 8 Apex Legendsâ„¢ 0.00 **** FREE
## 9 ARC Raiders 39.99 **** PREMIUIM
## 10 Monster Hunter Stories 3: Twisted Reflection 69.99 **** PREMIUIM
Interpretation: rating is given as 1-5 stars to the games according the review scores and games are separated into category as free, affordable and premium based on price of the games.
ggplot(result5, aes(x = year, y = total_games)) +
geom_line(color = "blue", linewidth = 1) +
labs(title = "Number of Games Released Each Year",
x = "Year",
y = "Number of Games")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
Interpretation: There was a rise in release of games after 2020.
ggplot(result3, aes(x = freevspaid, y = total_games, fill = freevspaid)) +
geom_bar(stat = "identity") +
labs(title = "how many games are free and paid",
x = "free vs paid",
y = "number of games")
Interpretation: most of the games are paid showing mostly premium set of games are available at steam. ## Bar Chart: Sales of the games according to different genre
ggplot(result4, aes(x = Primary_Genre, y = total_sales, fill = Primary_Genre))+
geom_bar(stat = "identity") +
labs(title = "sales of games of different genre",
x = "Genre",
y = "sales")
Interpretation: Most of the games sold are of action genre.
scatt <- steam %>%
mutate(owners = Estimated_Owners/100, players = X24h_Peak_Players/100, reviews = Total_Reviews/100)
ggplot(scatt, aes(x = owners, y = players)) +
geom_point() +
labs(title = "Total owners vs Total players playtime")
Interpretation: There are more owners than the playtime and there are some outliers.
ggplot(steam, aes(x = Price_USD, y = Total_Reviews)) +
geom_point(alpha = 0.6) +
labs(title = "Price vs Reviews")
Interpretation: No strong linear relationship exists between price and reviews, suggesting that popularity is influenced more by gameplay quality and marketing rather than cost.
result6 <- steam %>%
filter(Price_USD >0)
ggplot(result6, aes(x = Price_USD)) +
geom_histogram(
aes(y = ..density..),
binwidth = 1,
fill = "#2fa",
color = "black",
alpha = 0.7
) +
geom_density(color = "blue", size = 1.5, adjust = 1.5)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Interpretation: data of price is slightly right skewed
result8 <- steam %>%
group_by(rating) %>%
summarise(total = n())
pie(result8$total, labels = result8$rating, col = c("#234", "#3c2", "#986", "#097", "#fc2"),
main = "proportion of games by rating")
legend("topright", legend = result8$rating, fill = c("#234", "#3c2", "#986", "#097", "#fc2"))
Interpretation: Yellow segment indicates most games received 5 star rating
pair_data <- steam[, c("Price_USD", "Total_Reviews", "Estimated_Owners", "X24h_Peak_Players")]
ggpairs(pair_data)
ggplot(steam, aes(group = Primary_Genre, x = Primary_Genre, y = Review_Score_Pct, fill = Primary_Genre)) +
geom_boxplot(outlier.colour = "darkgreen", outlier.shape = 16) +
labs(title = "reviews variability across different genres",
x = "GENRE",
y = "REVIEWS") +
theme_minimal()
Interpretation: simulation and unknown genre exhibit higher medians compared to other genres having low medians and variability. Action and RPG have more outliers.
ggplot(steam, aes(group = Primary_Genre, x = Primary_Genre, y = Price_USD, fill = Primary_Genre)) +
geom_boxplot(outlier.colour = "darkgreen", outlier.shape = 16) +
labs(title = "Price variability across different genres",
x = "GENRE",
y = "Price($USD)") +
theme_minimal()
Interpretation: Racing , simulation, action and unknown genre exhibit higher medians compared to other genres having low medians and variability. adventure and casual have some outliers.
steam_model <- lm(Total_Reviews ~ Estimated_Owners ,data = steam)
summary(steam_model)
## Warning in summary.lm(steam_model): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = Total_Reviews ~ Estimated_Owners, data = steam)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.149e-09 -5.530e-12 -6.200e-13 1.114e-11 8.023e-11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.473e-11 2.312e-12 -6.371e+00 2.87e-10 ***
## Estimated_Owners 3.333e-02 3.488e-19 9.557e+16 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.997e-11 on 998 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 9.134e+33 on 1 and 998 DF, p-value: < 2.2e-16
predict_data <- data.frame(Estimated_Owners = c(23456, 60000, 1000, 8500))
predict(steam_model, predict_data)
## 1 2 3 4
## 781.86667 2000.00000 33.33333 283.33333
ggplot(scatt, aes(x = owners, y = reviews)) +
geom_point(color = "#fc2", size = 3) +
stat_smooth(method = "lm",
formula = y ~ x,
color = "blue",
size = 1.5,
se = FALSE) +
labs(title = "Estimated Owners vs Total reviews (simple linear model)",
x = "estimated owners",
y = " total reviews") +
theme_minimal()
steam_cor <- steam[, c("Price_USD", "Review_Score_Pct", "Total_Reviews", "Estimated_Owners", "X24h_Peak_Players")]
cor_steam <- cor(steam_cor)
corrplot(cor_steam, method = "color", addCoef.col = "#111", number.cex = 0.7)
Interpretation: There is a strong positive correlation between Estimated Owners and Peak Players, indicating that games with a larger user base tend to have higher active player counts. However, price shows weak correlation with review scores, suggesting that higher cost does not guarantee better quality.