vibhu.dagar2016@vitstudent.ac.in
VIT Vellore
==> Dataset for analyzing video game sales:
# Load the video game sales dataset.
# Text columns are read as factors explicitly: the recorded summary output
# shows per-level counts for Name/Platform/Year/Genre/Publisher, and the
# plot() calls on those columns need factors. R >= 4.0 no longer converts
# strings to factors by default, so the argument must be spelled out.
vgsales <- read.csv(
  "C:/Program Files/RStudio/files/vgsales.csv",
  stringsAsFactors = TRUE
)
# View() opens a data viewer window; skip it when the script is run
# non-interactively (e.g. Rscript or knitting), where no viewer is available.
if (interactive()) {
  View(vgsales)
}
summary(vgsales)
## Rank Name Platform
## Min. : 1 Need for Speed: Most Wanted: 12 DS :2163
## 1st Qu.: 4151 FIFA 14 : 9 PS2 :2161
## Median : 8300 LEGO Marvel Super Heroes : 9 PS3 :1329
## Mean : 8301 Madden NFL 07 : 9 Wii :1325
## 3rd Qu.:12450 Ratatouille : 9 X360 :1265
## Max. :16600 Angry Birds Star Wars : 8 PSP :1213
## (Other) :16542 (Other):7142
## Year Genre Publisher
## 2009 :1431 Action :3316 Electronic Arts : 1351
## 2008 :1428 Sports :2346 Activision : 975
## 2010 :1259 Misc :1739 Namco Bandai Games : 932
## 2007 :1202 Role-Playing:1488 Ubisoft : 921
## 2011 :1139 Shooter :1310 Konami Digital Entertainment: 832
## 2006 :1008 Adventure :1286 THQ : 715
## (Other):9131 (Other) :5113 (Other) :10872
## NA_Sales EU_Sales JP_Sales Other_Sales
## Min. : 0.0000 Min. : 0.0000 Min. : 0.00000 Min. : 0.00000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.: 0.00000 1st Qu.: 0.00000
## Median : 0.0800 Median : 0.0200 Median : 0.00000 Median : 0.01000
## Mean : 0.2647 Mean : 0.1467 Mean : 0.07778 Mean : 0.04806
## 3rd Qu.: 0.2400 3rd Qu.: 0.1100 3rd Qu.: 0.04000 3rd Qu.: 0.04000
## Max. :41.4900 Max. :29.0200 Max. :10.22000 Max. :10.57000
##
## Global_Sales
## Min. : 0.0100
## 1st Qu.: 0.0600
## Median : 0.1700
## Mean : 0.5374
## 3rd Qu.: 0.4700
## Max. :82.7400
##
library(psych)
# Per-column descriptive statistics from the psych package; in the recorded
# output the non-numeric columns are the ones marked with an asterisk.
psych::describe(vgsales)
## vars n mean sd median trimmed mad min
## Rank 1 16598 8300.61 4791.85 8300.50 8300.56 6152.05 1.00
## Name* 2 16598 5795.86 3324.01 5864.50 5810.22 4270.63 1.00
## Platform* 3 16598 16.71 8.29 17.00 16.67 10.38 1.00
## Year* 4 16598 27.61 6.00 28.00 28.00 5.93 1.00
## Genre* 5 16598 5.93 3.76 6.00 5.86 5.93 1.00
## Publisher* 6 16598 299.40 181.98 329.00 303.97 272.80 1.00
## NA_Sales 7 16598 0.26 0.82 0.08 0.13 0.12 0.00
## EU_Sales 8 16598 0.15 0.51 0.02 0.06 0.03 0.00
## JP_Sales 9 16598 0.08 0.31 0.00 0.02 0.00 0.00
## Other_Sales 10 16598 0.05 0.19 0.01 0.02 0.01 0.00
## Global_Sales 11 16598 0.54 1.56 0.17 0.27 0.21 0.01
## max range skew kurtosis se
## Rank 16600.00 16599.00 0.00 -1.20 37.19
## Name* 11493.00 11492.00 -0.03 -1.21 25.80
## Platform* 31.00 30.00 -0.05 -1.00 0.06
## Year* 40.00 39.00 -0.86 1.68 0.05
## Genre* 12.00 11.00 0.07 -1.43 0.03
## Publisher* 579.00 578.00 -0.15 -1.40 1.41
## NA_Sales 41.49 41.49 18.80 648.86 0.01
## EU_Sales 29.02 29.02 18.87 755.71 0.00
## JP_Sales 10.22 10.22 11.20 194.15 0.00
## Other_Sales 10.57 10.57 24.23 1024.92 0.00
## Global_Sales 82.74 82.73 17.40 603.68 0.01
==> Yearly Production of video games:
# Number of games per release year. Year appears as a factor in the recorded
# summary output, so plot() is given a single factor here.
plot(
  vgsales[["Year"]],
  main = "Yearly Distribution of Video Games",
  xlab = "Years",
  ylab = "No. of games",
  col = "cyan"
)
==> Distribution of games with respect to consoles:
# Number of games per platform (single-factor plot).
plot(
  vgsales[["Platform"]],
  main = "Platform based distribution",
  xlab = "Platform",
  ylab = "No. of games",
  col = "cyan"
)
==> Distribution of games with respect to genres:
# Number of games per genre (single-factor plot).
plot(
  vgsales[["Genre"]],
  main = "Genre based distribution",
  xlab = "Genre",
  ylab = "No. of games",
  col = "cyan"
)
[ Here we see that the Action genre is the most popular amongst all ]
==> Distribution of games with respect to publishers:
# Number of games per publisher (single-factor plot, default bar colour).
plot(
  vgsales[["Publisher"]],
  main = "Publisher based distribution",
  xlab = "Publisher",
  ylab = "No. of games"
)
==> Sales of games with respect to the platforms in North America:
# Platform (factor) modelled against North American sales (numeric).
# NOTE(review): with a factor on the left-hand side plot() is expected to draw
# a spine plot of platform share across sales bins, not per-platform sales
# totals - confirm this is the intended chart.
plot(
  Platform ~ NA_Sales,
  data = vgsales,
  main = "Sales based distribution",
  xlab = "",
  ylab = "No. of games with respect to platforms",
  col = "cyan"
)
[ The above graph shows that xbox sales are maximum in North America ]
==> Sales of games with respect to the platforms in Europe:
# Platform (factor) modelled against European sales (numeric).
# NOTE(review): with a factor on the left-hand side plot() is expected to draw
# a spine plot of platform share across sales bins, not per-platform sales
# totals - confirm this is the intended chart.
plot(
  Platform ~ EU_Sales,
  data = vgsales,
  main = "Sales based distribution",
  xlab = "",
  ylab = "No. of games with respect to platforms",
  col = "cyan"
)
[ The above graph shows that xbox sales are maximum in Europe ]
==> Sales of games with respect to the platforms in Japan:
# Platform (factor) modelled against Japanese sales (numeric).
# NOTE(review): with a factor on the left-hand side plot() is expected to draw
# a spine plot of platform share across sales bins, not per-platform sales
# totals - confirm this is the intended chart.
plot(
  Platform ~ JP_Sales,
  data = vgsales,
  main = "Sales based distribution",
  xlab = "",
  ylab = "No. of games with respect to platforms",
  col = "cyan"
)
[ The above graph shows that xbox sales are maximum in Japan ]
==> Sales of games with respect to the platforms globally:
# Platform (factor) modelled against global sales (numeric).
# NOTE(review): with a factor on the left-hand side plot() is expected to draw
# a spine plot of platform share across sales bins, not per-platform sales
# totals - confirm this is the intended chart.
plot(
  Platform ~ Global_Sales,
  data = vgsales,
  main = "Sales based distribution",
  xlab = "",
  ylab = "No. of games with respect to platforms",
  col = "cyan"
)
[ The above graph shows that xbox sales are maximum globally ]
==> Scatterplot matrix of sales in different regions:
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Pairwise scatterplots of the North American, European and Japanese sales
# columns, drawn with car::scatterplotMatrix.
# NOTE(review): reg.line = "" presumably suppresses the fitted regression
# lines - confirm against the installed version of car.
scatterplotMatrix(
  formula = ~ NA_Sales + EU_Sales + JP_Sales,
  data = vgsales,
  cex = 1,
  reg.line = ""
)
==> Distribution of sales in different regions:
# Minimum, maximum and median of each sales column; the "##" lines are the
# recorded console output of the call directly above them.

# North America
min(vgsales[["NA_Sales"]])
## [1] 0
max(vgsales[["NA_Sales"]])
## [1] 41.49
median(vgsales[["NA_Sales"]])
## [1] 0.08

# Japan
min(vgsales[["JP_Sales"]])
## [1] 0
max(vgsales[["JP_Sales"]])
## [1] 10.22
median(vgsales[["JP_Sales"]])
## [1] 0

# Europe
min(vgsales[["EU_Sales"]])
## [1] 0
max(vgsales[["EU_Sales"]])
## [1] 29.02
median(vgsales[["EU_Sales"]])
## [1] 0.02

# Other regions
min(vgsales[["Other_Sales"]])
## [1] 0
max(vgsales[["Other_Sales"]])
## [1] 10.57
median(vgsales[["Other_Sales"]])
## [1] 0.01

# Global
min(vgsales[["Global_Sales"]])
## [1] 0.01
max(vgsales[["Global_Sales"]])
## [1] 82.74
median(vgsales[["Global_Sales"]])
## [1] 0.17
==> Corrgram representing the correlations among the variables in the dataset:
library(corrgram)
# Correlogram of the whole data frame: panel.shade below the diagonal,
# panel.pie above it, and variable names from panel.txt; order = TRUE lets
# corrgram choose the variable ordering.
corrgram(
  vgsales,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = " corrgram for the distributions in the dataset "
)
==> North American Sales distribution with respect to each year:
# Scatter plot with a loess curve of North American sales against year.
# Year is a factor in this data frame (the recorded summary lists per-level
# counts), so passing it directly would plot the internal level codes rather
# than calendar years. Convert via character to numeric first; entries that
# are not numeric years become NA and are therefore not plotted.
scatter.smooth(
  suppressWarnings(as.numeric(as.character(vgsales$Year))),
  vgsales$NA_Sales,
  xlab = "Year",
  ylab = "NA_Sales",
  main = "Year vs NA_Sales"
)
==> European Sales distribution with respect to each year:
# Scatter plot with a loess curve of European sales against year.
# Year is a factor in this data frame (the recorded summary lists per-level
# counts), so passing it directly would plot the internal level codes rather
# than calendar years. Convert via character to numeric first; entries that
# are not numeric years become NA and are therefore not plotted.
scatter.smooth(
  suppressWarnings(as.numeric(as.character(vgsales$Year))),
  vgsales$EU_Sales,
  xlab = "Year",
  ylab = "EU_Sales",
  main = "Year vs EU_Sales"
)
==> Japanese Sales distribution with respect to each year:
# Scatter plot with a loess curve of Japanese sales against year.
# Year is a factor in this data frame (the recorded summary lists per-level
# counts), so passing it directly would plot the internal level codes rather
# than calendar years. Convert via character to numeric first; entries that
# are not numeric years become NA and are therefore not plotted.
scatter.smooth(
  suppressWarnings(as.numeric(as.character(vgsales$Year))),
  vgsales$JP_Sales,
  xlab = "Year",
  ylab = "JP_Sales",
  main = "Year vs JP_Sales"
)
==> Global Sales distribution with respect to each year:
# Scatter plot with a loess curve of global sales against year.
# Year is a factor in this data frame (the recorded summary lists per-level
# counts), so passing it directly would plot the internal level codes rather
# than calendar years. Convert via character to numeric first; entries that
# are not numeric years become NA and are therefore not plotted.
scatter.smooth(
  suppressWarnings(as.numeric(as.character(vgsales$Year))),
  vgsales$Global_Sales,
  xlab = "Year",
  ylab = "Global_Sales",
  main = "Year vs Global_Sales"
)
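==> Hypothesis: There is no significant change in NA_Sales with respect to genre (the model below tests genre only; platform is not included).
[ In the following output, the genres with p-value < 0.05 differ significantly in mean sales from the baseline genre (Action, absorbed into the intercept); the rest show no significant difference at the 5% level. ]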
# Linear model of North American sales with Genre as the only predictor.
# The genre with no coefficient row in the output is the reference level
# carried by the intercept.
fit <- lm(
  formula = NA_Sales ~ Genre,
  data = vgsales
)
summary(fit)
##
## Call:
## lm(formula = NA_Sales ~ Genre, data = vgsales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.505 -0.236 -0.153 -0.015 41.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.264726 0.014081 18.801 < 2e-16 ***
## GenreAdventure -0.182455 0.026636 -6.850 7.65e-12 ***
## GenreFighting -0.001058 0.031202 -0.034 0.9729
## GenreMisc -0.028820 0.024007 -1.200 0.2300
## GenrePlatform 0.239846 0.030664 7.822 5.53e-15 ***
## GenrePuzzle -0.052045 0.036440 -1.428 0.1532
## GenreRacing 0.023041 0.026919 0.856 0.3921
## GenreRole-Playing -0.044779 0.025300 -1.770 0.0768 .
## GenreShooter 0.180007 0.026460 6.803 1.06e-11 ***
## GenreSimulation -0.053295 0.030928 -1.723 0.0849 .
## GenreSports 0.026557 0.021875 1.214 0.2247
## GenreStrategy -0.163845 0.034113 -4.803 1.58e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8108 on 16586 degrees of freedom
## Multiple R-squared: 0.01493, Adjusted R-squared: 0.01428
## F-statistic: 22.86 on 11 and 16586 DF, p-value: < 2.2e-16
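==> Hypothesis: There is no significant change in EU_Sales with respect to genre (the model below tests genre only; platform is not included).
[ In the following output, the genres with p-value < 0.05 differ significantly in mean sales from the baseline genre (Action, absorbed into the intercept); the rest show no significant difference at the 5% level. ]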
# Linear model of European sales with Genre as the only predictor.
# The genre with no coefficient row in the output is the reference level
# carried by the intercept.
fit <- lm(
  formula = EU_Sales ~ Genre,
  data = vgsales
)
summary(fit)
##
## Call:
## lm(formula = EU_Sales ~ Genre, data = vgsales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2391 -0.1409 -0.1064 -0.0299 28.8594
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.158323 0.008736 18.123 < 2e-16 ***
## GenreAdventure -0.108455 0.016526 -6.563 5.44e-11 ***
## GenreFighting -0.038842 0.019358 -2.006 0.044821 *
## GenreMisc -0.034125 0.014894 -2.291 0.021966 *
## GenrePlatform 0.069250 0.019025 3.640 0.000273 ***
## GenrePuzzle -0.071072 0.022608 -3.144 0.001672 **
## GenreRacing 0.032541 0.016701 1.948 0.051380 .
## GenreRole-Playing -0.031939 0.015697 -2.035 0.041893 *
## GenreShooter 0.080814 0.016416 4.923 8.62e-07 ***
## GenreSimulation -0.027551 0.019189 -1.436 0.151088
## GenreSports 0.002312 0.013572 0.170 0.864742
## GenreStrategy -0.091745 0.021164 -4.335 1.47e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5031 on 16586 degrees of freedom
## Multiple R-squared: 0.00971, Adjusted R-squared: 0.009054
## F-statistic: 14.79 on 11 and 16586 DF, p-value: < 2.2e-16
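==> Hypothesis: There is no significant change in JP_Sales with respect to genre (the model below tests genre only; platform is not included).
[ In the following output, the genres with p-value < 0.05 differ significantly in mean sales from the baseline genre (Action, absorbed into the intercept); the rest show no significant difference at the 5% level. ]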
# Linear model of Japanese sales with Genre as the only predictor.
# The genre with no coefficient row in the output is the reference level
# carried by the intercept.
fit <- lm(
  formula = JP_Sales ~ Genre,
  data = vgsales
)
summary(fit)
##
## Call:
## lm(formula = JP_Sales ~ Genre, data = vgsales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2368 -0.0620 -0.0482 -0.0282 9.9832
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.048236 0.005282 9.132 < 2e-16 ***
## GenreAdventure -0.007746 0.009992 -0.775 0.438225
## GenreFighting 0.054771 0.011705 4.679 2.9e-06 ***
## GenreMisc 0.013731 0.009006 1.525 0.127352
## GenrePlatform 0.099360 0.011503 8.638 < 2e-16 ***
## GenrePuzzle 0.050235 0.013670 3.675 0.000239 ***
## GenreRacing -0.002848 0.010098 -0.282 0.777958
## GenreRole-Playing 0.188532 0.009491 19.865 < 2e-16 ***
## GenreShooter -0.019014 0.009926 -1.916 0.055427 .
## GenreSimulation 0.025236 0.011602 2.175 0.029635 *
## GenreSports 0.009467 0.008206 1.154 0.248659
## GenreStrategy 0.024393 0.012797 1.906 0.056643 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3042 on 16586 degrees of freedom
## Multiple R-squared: 0.03352, Adjusted R-squared: 0.03288
## F-statistic: 52.29 on 11 and 16586 DF, p-value: < 2.2e-16
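==> Hypothesis: There is no significant change in Other_Sales with respect to genre (the model below tests genre only; platform is not included).
[ In the following output, the genres with p-value < 0.05 differ significantly in mean sales from the baseline genre (Action, absorbed into the intercept); the rest show no significant difference at the 5% level. ]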
# Linear model of sales in other regions with Genre as the only predictor.
# The genre with no coefficient row in the output is the reference level
# carried by the intercept.
fit <- lm(
  formula = Other_Sales ~ Genre,
  data = vgsales
)
summary(fit)
##
## Call:
## lm(formula = Other_Sales ~ Genre, data = vgsales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0784 -0.0475 -0.0333 -0.0067 10.5135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.056508 0.003262 17.321 < 2e-16 ***
## GenreAdventure -0.043436 0.006172 -7.038 2.02e-12 ***
## GenreFighting -0.013253 0.007229 -1.833 0.066783 .
## GenreMisc -0.013196 0.005562 -2.372 0.017686 *
## GenrePlatform 0.001720 0.007105 0.242 0.808696
## GenrePuzzle -0.034944 0.008443 -4.139 3.51e-05 ***
## GenreRacing 0.005358 0.006237 0.859 0.390349
## GenreRole-Playing -0.016447 0.005862 -2.806 0.005025 **
## GenreShooter 0.021881 0.006131 3.569 0.000359 ***
## GenreSimulation -0.020153 0.007166 -2.812 0.004925 **
## GenreSports 0.001024 0.005068 0.202 0.839867
## GenreStrategy -0.039826 0.007904 -5.039 4.73e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1879 on 16586 degrees of freedom
## Multiple R-squared: 0.008315, Adjusted R-squared: 0.007657
## F-statistic: 12.64 on 11 and 16586 DF, p-value: < 2.2e-16
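==> Hypothesis: There is no significant change in Global_Sales with respect to genre (the model below tests genre only; platform is not included).
[ In the following output, the genres with p-value < 0.05 differ significantly in mean sales from the baseline genre (Action, absorbed into the intercept); the rest show no significant difference at the 5% level. ]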
# Linear model of global sales with Genre as the only predictor.
# The genre with no coefficient row in the output is the reference level
# carried by the intercept.
fit <- lm(
  formula = Global_Sales ~ Genre,
  data = vgsales
)
summary(fit)
##
## Call:
## lm(formula = Global_Sales ~ Genre, data = vgsales)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.928 -0.458 -0.307 -0.037 82.173
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.528100 0.026851 19.668 < 2e-16 ***
## GenreAdventure -0.342221 0.050795 -6.737 1.67e-11 ***
## GenreFighting 0.001275 0.059501 0.021 0.9829
## GenreMisc -0.062338 0.045780 -1.362 0.1733
## GenrePlatform 0.410241 0.058476 7.016 2.38e-12 ***
## GenrePuzzle -0.107224 0.069491 -1.543 0.1229
## GenreRacing 0.058001 0.051334 1.130 0.2585
## GenreRole-Playing 0.095132 0.048247 1.972 0.0486 *
## GenreShooter 0.263785 0.050458 5.228 1.74e-07 ***
## GenreSimulation -0.075736 0.058980 -1.284 0.1991
## GenreSports 0.039219 0.041715 0.940 0.3471
## GenreStrategy -0.270949 0.065052 -4.165 3.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.546 on 16586 degrees of freedom
## Multiple R-squared: 0.01194, Adjusted R-squared: 0.01128
## F-statistic: 18.22 on 11 and 16586 DF, p-value: < 2.2e-16