Reading Data

# Load the ads data. stringsAsFactors = TRUE is spelled out because the
# read.csv default changed to FALSE in R 4.0; the summaries below treat
# Gender as a factor.
dataset <- read.csv("Social_Network_Ads.csv", stringsAsFactors = TRUE)

Describing Data

library(psych)
## Warning: package 'psych' was built under R version 3.4.3
# Per-column summary statistics (n, mean, sd, median, skew, ...).
psych::describe(dataset)
##                 vars   n        mean       sd   median     trimmed
## User.ID            1 515 15689466.94 71282.69 15692819 15688984.03
## Gender*            2 515        1.50     0.50        1        1.50
## Age                3 515       37.60    10.38       37       37.36
## EstimatedSalary    4 515    68100.97 34416.90    65000    65464.89
## Purchased          5 515        0.37     0.48        0        0.33
##                      mad      min      max  range skew kurtosis      se
## User.ID         91417.12 15566689 15815236 248547 0.01    -1.18 3141.09
## Gender*             0.00        1        2      1 0.01    -2.00    0.02
## Age                11.86       18       60     42 0.17    -0.68    0.46
## EstimatedSalary 32617.20    15000   150000 135000 0.54    -0.42 1516.59
## Purchased           0.00        0        1      1 0.56    -1.69    0.02

Two-Way Contingency Table (Gender by Purchased)

# Cross-tabulate Gender (rows) against Purchased (columns) and show the counts.
mytable <- xtabs(
  ~ Gender + Purchased,
  data = dataset
)
print(mytable)
##         Purchased
## Gender     0   1
##   Female 159 100
##   Male   168  88

BoxPlot

# Compare the salary distribution of non-purchasers (0) and purchasers (1).
# The numeric variable belongs on the left of the formula and the grouping
# variable on the right; the reverse draws one box per distinct salary value.
boxplot(EstimatedSalary ~ Purchased, data = dataset,
        main = "Estimated Salary by Purchase",
        xlab = "Purchased", ylab = "Estimated Salary")

# Compare the age distribution of non-purchasers (0) and purchasers (1).
# Age is the numeric response and Purchased the grouping variable; the
# reverse draws one box per distinct age value.
boxplot(Age ~ Purchased, data = dataset,
        main = "Age by Purchase",
        xlab = "Purchased", ylab = "Age")

Histograms

# Histogram of the Purchased indicator.
# hist() takes the vector itself and has no `data` argument; passing one only
# forwards it to the plotting calls, which warn that it is not a graphical
# parameter.
a <- dataset$Purchased
hist(a,
     main = "Distribution of Purchased", xlab = "Purchased or Not",
     col = "grey")

# Histogram of customer ages.
# hist() takes the vector itself and has no `data` argument; passing one only
# triggers "not a graphical parameter" warnings.
b <- dataset$Age
hist(b, main = "Distribution of Age", xlab = "Different Ages", col = "blue")

# Histogram of estimated salaries.
# hist() takes the vector itself and has no `data` argument; passing one only
# triggers "not a graphical parameter" warnings.
# NOTE(review): the variable name `c` masks the name of base::c (calls to c()
# still resolve to the function); it is kept in case later code reads it.
c <- dataset$EstimatedSalary
hist(c,
     main = "Distribution of Estimated Salaries", xlab = "Different Salaries ",
     col = "green")

Correlogram (corrgram)

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Correlogram of the data set: shaded cells in the lower triangle and pie
# glyphs in the upper triangle. The title names this data set (it previously
# carried a title from an unrelated analysis).
corrgram(dataset, order = TRUE, lower.panel = panel.shade,
         upper.panel = panel.pie, text.panel = panel.txt,
         main = "Social Network Ads: Correlogram")

Correlation matrix

library(corpcor)
library(tseries)
## Warning: package 'tseries' was built under R version 3.4.3
# Pearson correlations among Age, EstimatedSalary and Purchased.
# Columns are selected by name rather than by position so a change in column
# order cannot silently change which variables are correlated.
data_mat <- as.matrix(dataset[, c("Age", "EstimatedSalary", "Purchased")])
covmat <- cov(data_mat)
# Rescale the covariance matrix to a correlation matrix and print it.
cov2cor(covmat)
##                       Age EstimatedSalary Purchased
## Age             1.0000000       0.1241452 0.6387109
## EstimatedSalary 0.1241452       1.0000000 0.2954804
## Purchased       0.6387109       0.2954804 1.0000000

ScatterPlot Matrix

library(car)
## Warning: package 'car' was built under R version 3.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Pairwise scatterplots of Gender, Age, EstimatedSalary and Purchased.
scatterplotMatrix(
  formula = ~ Gender + Age + EstimatedSalary + Purchased,
  cex = 0.6,
  data = dataset
)

Null Hypothesis: Purchased is independent of Gender

# Cross-tabulate Purchased (rows) against Gender (columns) and show the counts.
mytable <- xtabs(
  ~ Purchased + Gender,
  data = dataset
)
print(mytable)
##          Gender
## Purchased Female Male
##         0    159  168
##         1    100   88

Chi-squared test

# Pearson chi-squared test on the contingency table; correct = TRUE is the
# default and applies Yates' continuity correction to a 2 x 2 table.
chisq.test(mytable, correct = TRUE)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  mytable
## X-squared = 0.82189, df = 1, p-value = 0.3646

t-test

# Two-sided Welch t-test comparing the mean of Purchased between the Gender
# groups; var.equal = FALSE and alternative = "two.sided" are the defaults,
# written out here.
t.test(
  Purchased ~ Gender,
  data = dataset,
  alternative = "two.sided",
  var.equal = FALSE
)
## 
##  Welch Two Sample t-test
## 
## data:  Purchased by Gender
## t = 0.99728, df = 512.91, p-value = 0.3191
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.04107827  0.12577905
## sample estimates:
## mean in group Female   mean in group Male 
##            0.3861004            0.3437500