vibhu.dagar2016@vitstudent.ac.in

VIT Vellore

==> Dataset for analyzing video game sales:

# Load the video game sales data set and print a per-column summary.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads text columns
# as character by default, while the summary below and the later bar plots
# rely on Name, Platform, Year, Genre and Publisher being factors.
vgsales <- read.csv(
  "C:/Program Files/RStudio/files/vgsales.csv",
  stringsAsFactors = TRUE
)
# View() opens a data viewer window, so only call it in an interactive session.
if (interactive()) {
  View(vgsales)
}
summary(vgsales)
##       Rank                                Name          Platform   
##  Min.   :    1   Need for Speed: Most Wanted:   12   DS     :2163  
##  1st Qu.: 4151   FIFA 14                    :    9   PS2    :2161  
##  Median : 8300   LEGO Marvel Super Heroes   :    9   PS3    :1329  
##  Mean   : 8301   Madden NFL 07              :    9   Wii    :1325  
##  3rd Qu.:12450   Ratatouille                :    9   X360   :1265  
##  Max.   :16600   Angry Birds Star Wars      :    8   PSP    :1213  
##                  (Other)                    :16542   (Other):7142  
##       Year               Genre                             Publisher    
##  2009   :1431   Action      :3316   Electronic Arts             : 1351  
##  2008   :1428   Sports      :2346   Activision                  :  975  
##  2010   :1259   Misc        :1739   Namco Bandai Games          :  932  
##  2007   :1202   Role-Playing:1488   Ubisoft                     :  921  
##  2011   :1139   Shooter     :1310   Konami Digital Entertainment:  832  
##  2006   :1008   Adventure   :1286   THQ                         :  715  
##  (Other):9131   (Other)     :5113   (Other)                     :10872  
##     NA_Sales          EU_Sales          JP_Sales         Other_Sales      
##  Min.   : 0.0000   Min.   : 0.0000   Min.   : 0.00000   Min.   : 0.00000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.: 0.00000   1st Qu.: 0.00000  
##  Median : 0.0800   Median : 0.0200   Median : 0.00000   Median : 0.01000  
##  Mean   : 0.2647   Mean   : 0.1467   Mean   : 0.07778   Mean   : 0.04806  
##  3rd Qu.: 0.2400   3rd Qu.: 0.1100   3rd Qu.: 0.04000   3rd Qu.: 0.04000  
##  Max.   :41.4900   Max.   :29.0200   Max.   :10.22000   Max.   :10.57000  
##                                                                           
##   Global_Sales    
##  Min.   : 0.0100  
##  1st Qu.: 0.0600  
##  Median : 0.1700  
##  Mean   : 0.5374  
##  3rd Qu.: 0.4700  
##  Max.   :82.7400  
## 
library(psych)
# Descriptive statistics (n, mean, sd, median, skew, ...) for every column.
psych::describe(x = vgsales)
##              vars     n    mean      sd  median trimmed     mad  min
## Rank            1 16598 8300.61 4791.85 8300.50 8300.56 6152.05 1.00
## Name*           2 16598 5795.86 3324.01 5864.50 5810.22 4270.63 1.00
## Platform*       3 16598   16.71    8.29   17.00   16.67   10.38 1.00
## Year*           4 16598   27.61    6.00   28.00   28.00    5.93 1.00
## Genre*          5 16598    5.93    3.76    6.00    5.86    5.93 1.00
## Publisher*      6 16598  299.40  181.98  329.00  303.97  272.80 1.00
## NA_Sales        7 16598    0.26    0.82    0.08    0.13    0.12 0.00
## EU_Sales        8 16598    0.15    0.51    0.02    0.06    0.03 0.00
## JP_Sales        9 16598    0.08    0.31    0.00    0.02    0.00 0.00
## Other_Sales    10 16598    0.05    0.19    0.01    0.02    0.01 0.00
## Global_Sales   11 16598    0.54    1.56    0.17    0.27    0.21 0.01
##                   max    range  skew kurtosis    se
## Rank         16600.00 16599.00  0.00    -1.20 37.19
## Name*        11493.00 11492.00 -0.03    -1.21 25.80
## Platform*       31.00    30.00 -0.05    -1.00  0.06
## Year*           40.00    39.00 -0.86     1.68  0.05
## Genre*          12.00    11.00  0.07    -1.43  0.03
## Publisher*     579.00   578.00 -0.15    -1.40  1.41
## NA_Sales        41.49    41.49 18.80   648.86  0.01
## EU_Sales        29.02    29.02 18.87   755.71  0.00
## JP_Sales        10.22    10.22 11.20   194.15  0.00
## Other_Sales     10.57    10.57 24.23  1024.92  0.00
## Global_Sales    82.74    82.73 17.40   603.68  0.01

Visualizing the Dataset

==> Yearly Production of video games:

# Bar chart of the number of games per Year level (Year is a factor).
plot(
  vgsales[["Year"]],
  main = "Yearly Distribution of Video Games",
  xlab = "Years",
  ylab = "No. of games",
  col = "cyan"
)

==> Distribution of games with respect to consoles:

# Bar chart of the number of games released on each platform.
plot(
  vgsales[["Platform"]],
  col = "cyan",
  xlab = "Platform",
  ylab = "No. of games",
  main = "Platform based distribution"
)

==> Distribution of games with respect to genres:

# Bar chart of the number of games in each genre.
plot(
  vgsales[["Genre"]],
  col = "cyan",
  xlab = "Genre",
  ylab = "No. of games",
  main = "Genre based distribution"
)

[ Here we see that Action is the most common genre by number of titles (3316 games); this counts releases, not sales. ]

==> Distribution of games with respect to publishers:

# Bar chart of the number of games per publisher (default bar colour).
plot(
  vgsales[["Publisher"]],
  xlab = "Publisher",
  ylab = "No. of games",
  main = "Publisher based distribution"
)

==> Sales of games with respect to the platforms in North America:

# Factor response (Platform) against a numeric predictor (NA_Sales): R draws
# this formula as a spine plot of platform shares per sales bin.
# Axis labels are given explicitly, so the variable names never appear.
plot(
  Platform ~ NA_Sales,
  data = vgsales,
  main = "Sales based distribution",
  xlab = "",
  ylab = "No. of games with respect to platforms",
  col = "cyan"
)

[ The above graph shows that xbox sales are maximum in North America ]

==> Sales of games with respect to the platforms in Europe:

# Factor response (Platform) against a numeric predictor (EU_Sales): R draws
# this formula as a spine plot of platform shares per sales bin.
# Axis labels are given explicitly, so the variable names never appear.
plot(
  Platform ~ EU_Sales,
  data = vgsales,
  main = "Sales based distribution",
  xlab = "",
  ylab = "No. of games with respect to platforms",
  col = "cyan"
)

[ The above graph shows that xbox sales are maximum in Europe ]

==> Sales of games with respect to the platforms in Japan:

# Factor response (Platform) against a numeric predictor (JP_Sales): R draws
# this formula as a spine plot of platform shares per sales bin.
# Axis labels are given explicitly, so the variable names never appear.
plot(
  Platform ~ JP_Sales,
  data = vgsales,
  main = "Sales based distribution",
  xlab = "",
  ylab = "No. of games with respect to platforms",
  col = "cyan"
)

[ The above graph shows that xbox sales are maximum in Japan ]

==> Sales of games with respect to the platforms globally:

# Factor response (Platform) against a numeric predictor (Global_Sales): R
# draws this formula as a spine plot of platform shares per sales bin.
# Axis labels are given explicitly, so the variable names never appear.
plot(
  Platform ~ Global_Sales,
  data = vgsales,
  main = "Sales based distribution",
  xlab = "",
  ylab = "No. of games with respect to platforms",
  col = "cyan"
)

[ The above graph shows that xbox sales are maximum globally ]

==> Scatterplot matrix of sales in different regions:

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Pairwise scatterplots of North American, European and Japanese sales,
# drawn without the fitted regression line.
# The original passed reg.line = "" (a non-function value) to suppress the
# line. car 3.0-0 renamed that argument to `regLine`, so pick the documented
# "off" switch that matches the installed version.
if (utils::packageVersion("car") >= "3.0.0") {
  scatterplotMatrix(
    ~ NA_Sales + EU_Sales + JP_Sales,
    data = vgsales,
    cex = 1,
    regLine = FALSE
  )
} else {
  scatterplotMatrix(
    ~ NA_Sales + EU_Sales + JP_Sales,
    data = vgsales,
    cex = 1,
    reg.line = FALSE
  )
}

==> Distribution of sales in different regions:

# Minimum, maximum and median of each sales column, one region at a time.
# The "## [1] ..." lines are the echoed results of the call above them.

# North America
min(vgsales[["NA_Sales"]])
## [1] 0
max(vgsales[["NA_Sales"]])
## [1] 41.49
median(vgsales[["NA_Sales"]])
## [1] 0.08
# Japan
min(vgsales[["JP_Sales"]])
## [1] 0
max(vgsales[["JP_Sales"]])
## [1] 10.22
median(vgsales[["JP_Sales"]])
## [1] 0
# Europe
min(vgsales[["EU_Sales"]])
## [1] 0
max(vgsales[["EU_Sales"]])
## [1] 29.02
median(vgsales[["EU_Sales"]])
## [1] 0.02
# Other regions
min(vgsales[["Other_Sales"]])
## [1] 0
max(vgsales[["Other_Sales"]])
## [1] 10.57
median(vgsales[["Other_Sales"]])
## [1] 0.01
# Global total
min(vgsales[["Global_Sales"]])
## [1] 0.01
max(vgsales[["Global_Sales"]])
## [1] 82.74
median(vgsales[["Global_Sales"]])
## [1] 0.17

==> Corrgram representing the correlations between variables in the dataset:

library(corrgram)
# Correlogram of the data set: shaded cells below the diagonal, pie glyphs
# above it, variable names on the diagonal, variables reordered (order = TRUE).
# NOTE(review): vgsales also holds factor columns; confirm how corrgram()
# treats non-numeric columns.
corrgram(
  x = vgsales,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = " corrgram for the distributions in the dataset "
)

==> North American Sales distribution with respect to each year:

# Scatter plot of NA_Sales against release year with a loess smooth.
# Year is stored as a factor, so passing it directly would plot the level
# codes (1, 2, ...) on the x axis and treat non-numeric entries as a year.
# Convert the labels to calendar years and drop rows whose year is unknown.
local({
  # Non-numeric year labels become NA; that coercion warning is expected.
  year <- suppressWarnings(as.numeric(as.character(vgsales$Year)))
  known <- !is.na(year)
  scatter.smooth(
    year[known], vgsales$NA_Sales[known],
    xlab = "Year", ylab = "NA_Sales", main = "Year vs NA_Sales"
  )
})

==> European Sales distribution with respect to each year:

# Scatter plot of EU_Sales against release year with a loess smooth.
# Year is stored as a factor, so passing it directly would plot the level
# codes (1, 2, ...) on the x axis and treat non-numeric entries as a year.
# Convert the labels to calendar years and drop rows whose year is unknown.
local({
  # Non-numeric year labels become NA; that coercion warning is expected.
  year <- suppressWarnings(as.numeric(as.character(vgsales$Year)))
  known <- !is.na(year)
  scatter.smooth(
    year[known], vgsales$EU_Sales[known],
    xlab = "Year", ylab = "EU_Sales", main = "Year vs EU_Sales"
  )
})

==> Japanese Sales distribution with respect to each year:

# Scatter plot of JP_Sales against release year with a loess smooth.
# Year is stored as a factor, so passing it directly would plot the level
# codes (1, 2, ...) on the x axis and treat non-numeric entries as a year.
# Convert the labels to calendar years and drop rows whose year is unknown.
local({
  # Non-numeric year labels become NA; that coercion warning is expected.
  year <- suppressWarnings(as.numeric(as.character(vgsales$Year)))
  known <- !is.na(year)
  scatter.smooth(
    year[known], vgsales$JP_Sales[known],
    xlab = "Year", ylab = "JP_Sales", main = "Year vs JP_Sales"
  )
})

==> Global Sales distribution with respect to each year:

# Scatter plot of Global_Sales against release year with a loess smooth.
# Year is stored as a factor, so passing it directly would plot the level
# codes (1, 2, ...) on the x axis and treat non-numeric entries as a year.
# Convert the labels to calendar years and drop rows whose year is unknown.
local({
  # Non-numeric year labels become NA; that coercion warning is expected.
  year <- suppressWarnings(as.numeric(as.character(vgsales$Year)))
  known <- !is.na(year)
  scatter.smooth(
    year[known], vgsales$Global_Sales[known],
    xlab = "Year", ylab = "Global_Sales", main = "Year vs Global_Sales"
  )
})

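==> Hypothesis: There is no significant change in NA_Sales with respect to genre.

[ In the following output, genres with p-value < 0.05 differ significantly in NA_Sales from the baseline genre (Action, which is absorbed into the intercept); the remaining genres show no significant difference. The overall F-test (p-value < 2.2e-16) rejects the hypothesis. ]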

# Linear model of NA_Sales on Genre. Genre is a factor, so each coefficient
# is a genre's difference from the baseline level held in the intercept.
fit <- lm(formula = NA_Sales ~ Genre, data = vgsales)
print(summary(fit))
## 
## Call:
## lm(formula = NA_Sales ~ Genre, data = vgsales)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.505 -0.236 -0.153 -0.015 41.199 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.264726   0.014081  18.801  < 2e-16 ***
## GenreAdventure    -0.182455   0.026636  -6.850 7.65e-12 ***
## GenreFighting     -0.001058   0.031202  -0.034   0.9729    
## GenreMisc         -0.028820   0.024007  -1.200   0.2300    
## GenrePlatform      0.239846   0.030664   7.822 5.53e-15 ***
## GenrePuzzle       -0.052045   0.036440  -1.428   0.1532    
## GenreRacing        0.023041   0.026919   0.856   0.3921    
## GenreRole-Playing -0.044779   0.025300  -1.770   0.0768 .  
## GenreShooter       0.180007   0.026460   6.803 1.06e-11 ***
## GenreSimulation   -0.053295   0.030928  -1.723   0.0849 .  
## GenreSports        0.026557   0.021875   1.214   0.2247    
## GenreStrategy     -0.163845   0.034113  -4.803 1.58e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8108 on 16586 degrees of freedom
## Multiple R-squared:  0.01493,    Adjusted R-squared:  0.01428 
## F-statistic: 22.86 on 11 and 16586 DF,  p-value: < 2.2e-16

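==> Hypothesis: There is no significant change in EU_Sales with respect to genre.

[ In the following output, genres with p-value < 0.05 differ significantly in EU_Sales from the baseline genre (Action, which is absorbed into the intercept); the remaining genres show no significant difference. The overall F-test (p-value < 2.2e-16) rejects the hypothesis. ]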

# Linear model of EU_Sales on Genre. Genre is a factor, so each coefficient
# is a genre's difference from the baseline level held in the intercept.
fit <- lm(formula = EU_Sales ~ Genre, data = vgsales)
print(summary(fit))
## 
## Call:
## lm(formula = EU_Sales ~ Genre, data = vgsales)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.2391 -0.1409 -0.1064 -0.0299 28.8594 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.158323   0.008736  18.123  < 2e-16 ***
## GenreAdventure    -0.108455   0.016526  -6.563 5.44e-11 ***
## GenreFighting     -0.038842   0.019358  -2.006 0.044821 *  
## GenreMisc         -0.034125   0.014894  -2.291 0.021966 *  
## GenrePlatform      0.069250   0.019025   3.640 0.000273 ***
## GenrePuzzle       -0.071072   0.022608  -3.144 0.001672 ** 
## GenreRacing        0.032541   0.016701   1.948 0.051380 .  
## GenreRole-Playing -0.031939   0.015697  -2.035 0.041893 *  
## GenreShooter       0.080814   0.016416   4.923 8.62e-07 ***
## GenreSimulation   -0.027551   0.019189  -1.436 0.151088    
## GenreSports        0.002312   0.013572   0.170 0.864742    
## GenreStrategy     -0.091745   0.021164  -4.335 1.47e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5031 on 16586 degrees of freedom
## Multiple R-squared:  0.00971,    Adjusted R-squared:  0.009054 
## F-statistic: 14.79 on 11 and 16586 DF,  p-value: < 2.2e-16

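==> Hypothesis: There is no significant change in JP_Sales with respect to genre.

[ In the following output, genres with p-value < 0.05 differ significantly in JP_Sales from the baseline genre (Action, which is absorbed into the intercept); the remaining genres show no significant difference. The overall F-test (p-value < 2.2e-16) rejects the hypothesis. ]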

# Linear model of JP_Sales on Genre. Genre is a factor, so each coefficient
# is a genre's difference from the baseline level held in the intercept.
fit <- lm(formula = JP_Sales ~ Genre, data = vgsales)
print(summary(fit))
## 
## Call:
## lm(formula = JP_Sales ~ Genre, data = vgsales)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.2368 -0.0620 -0.0482 -0.0282  9.9832 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.048236   0.005282   9.132  < 2e-16 ***
## GenreAdventure    -0.007746   0.009992  -0.775 0.438225    
## GenreFighting      0.054771   0.011705   4.679  2.9e-06 ***
## GenreMisc          0.013731   0.009006   1.525 0.127352    
## GenrePlatform      0.099360   0.011503   8.638  < 2e-16 ***
## GenrePuzzle        0.050235   0.013670   3.675 0.000239 ***
## GenreRacing       -0.002848   0.010098  -0.282 0.777958    
## GenreRole-Playing  0.188532   0.009491  19.865  < 2e-16 ***
## GenreShooter      -0.019014   0.009926  -1.916 0.055427 .  
## GenreSimulation    0.025236   0.011602   2.175 0.029635 *  
## GenreSports        0.009467   0.008206   1.154 0.248659    
## GenreStrategy      0.024393   0.012797   1.906 0.056643 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3042 on 16586 degrees of freedom
## Multiple R-squared:  0.03352,    Adjusted R-squared:  0.03288 
## F-statistic: 52.29 on 11 and 16586 DF,  p-value: < 2.2e-16

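==> Hypothesis: There is no significant change in Other_Sales with respect to genre.

[ In the following output, genres with p-value < 0.05 differ significantly in Other_Sales from the baseline genre (Action, which is absorbed into the intercept); the remaining genres show no significant difference. The overall F-test (p-value < 2.2e-16) rejects the hypothesis. ]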

# Linear model of Other_Sales on Genre. Genre is a factor, so each coefficient
# is a genre's difference from the baseline level held in the intercept.
fit <- lm(formula = Other_Sales ~ Genre, data = vgsales)
print(summary(fit))
## 
## Call:
## lm(formula = Other_Sales ~ Genre, data = vgsales)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.0784 -0.0475 -0.0333 -0.0067 10.5135 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.056508   0.003262  17.321  < 2e-16 ***
## GenreAdventure    -0.043436   0.006172  -7.038 2.02e-12 ***
## GenreFighting     -0.013253   0.007229  -1.833 0.066783 .  
## GenreMisc         -0.013196   0.005562  -2.372 0.017686 *  
## GenrePlatform      0.001720   0.007105   0.242 0.808696    
## GenrePuzzle       -0.034944   0.008443  -4.139 3.51e-05 ***
## GenreRacing        0.005358   0.006237   0.859 0.390349    
## GenreRole-Playing -0.016447   0.005862  -2.806 0.005025 ** 
## GenreShooter       0.021881   0.006131   3.569 0.000359 ***
## GenreSimulation   -0.020153   0.007166  -2.812 0.004925 ** 
## GenreSports        0.001024   0.005068   0.202 0.839867    
## GenreStrategy     -0.039826   0.007904  -5.039 4.73e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1879 on 16586 degrees of freedom
## Multiple R-squared:  0.008315,   Adjusted R-squared:  0.007657 
## F-statistic: 12.64 on 11 and 16586 DF,  p-value: < 2.2e-16

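==> Hypothesis: There is no significant change in Global_Sales with respect to genre.

[ In the following output, genres with p-value < 0.05 differ significantly in Global_Sales from the baseline genre (Action, which is absorbed into the intercept); the remaining genres show no significant difference. The overall F-test (p-value < 2.2e-16) rejects the hypothesis. ]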

# Linear model of Global_Sales on Genre. Genre is a factor, so each
# coefficient is a genre's difference from the baseline level held in the
# intercept.
fit <- lm(formula = Global_Sales ~ Genre, data = vgsales)
print(summary(fit))
## 
## Call:
## lm(formula = Global_Sales ~ Genre, data = vgsales)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -0.928 -0.458 -0.307 -0.037 82.173 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.528100   0.026851  19.668  < 2e-16 ***
## GenreAdventure    -0.342221   0.050795  -6.737 1.67e-11 ***
## GenreFighting      0.001275   0.059501   0.021   0.9829    
## GenreMisc         -0.062338   0.045780  -1.362   0.1733    
## GenrePlatform      0.410241   0.058476   7.016 2.38e-12 ***
## GenrePuzzle       -0.107224   0.069491  -1.543   0.1229    
## GenreRacing        0.058001   0.051334   1.130   0.2585    
## GenreRole-Playing  0.095132   0.048247   1.972   0.0486 *  
## GenreShooter       0.263785   0.050458   5.228 1.74e-07 ***
## GenreSimulation   -0.075736   0.058980  -1.284   0.1991    
## GenreSports        0.039219   0.041715   0.940   0.3471    
## GenreStrategy     -0.270949   0.065052  -4.165 3.13e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.546 on 16586 degrees of freedom
## Multiple R-squared:  0.01194,    Adjusted R-squared:  0.01128 
## F-statistic: 18.22 on 11 and 16586 DF,  p-value: < 2.2e-16