COMBINED ASSIGNMENT DATA ANALYTICS & R

TEAM MEMBERS

Alexandra Aedo

Umar Farooq

Ryutei Kaguragi

Adiza Ojei

Nivedita Venkatramanan

# Install ggcorrplot only when it is not already available, so re-running the
# script does not reinstall the package each time. Plain ASCII quotes are
# required here; typographic quotes do not parse as an R string.
if (!requireNamespace("ggcorrplot", quietly = TRUE)) {
  install.packages("ggcorrplot")
}

library(readxl)

library(ggplot2)

library(plotly)

library(ggcorrplot)

Extracting the dataset from excel

# Read the campaign workbook from its (machine-specific) location on disk.
Final_Assignment_datasets_marketing_campaign_SF <- read_excel(
  path = "C:/Users/Adiza Ojei/Desktop/2021 Hard drive Docs/HULT ONLINE CLASS/Data Analytics/Final Assignment _datasets_marketing_campaign_SF.xlsx"
)

# Keep a plain data.frame copy under a shorter name for the analysis below.
mktg_data <- as.data.frame(Final_Assignment_datasets_marketing_campaign_SF)

Q1: What factors seem to drive web purchases?

Plotting Income vs Number of Web Purchases taking into account Education

## Income vs. number of web purchases, coloured by education level.
## The x-axis is limited to 1-170000, which leaves the one extreme Income
## value (666k) out of the plot.
chart1 <- ggplot(
  data = mktg_data,
  mapping = aes(x = Income, y = NumWebPurchases, color = Education)
) +
  geom_point() +
  labs(x = "Income", y = "Number of Web Purchases") +
  scale_x_continuous(limits = c(1, 170000))
ggplotly(chart1)

Plotting age vs Number of Web Purchases taking into account Number of kids at home

### Derive an age column from Year_Birth.
# Age is measured relative to the fixed year 2021 rather than the current
# date, so the result does not change with the day the script is run.
# (No placeholder assignment is needed before creating the column.)
mktg_data$age <- 2021 - mktg_data$Year_Birth

# Age vs. number of web purchases; Kidhome is converted to a factor so each
# value gets its own discrete colour.
chart2 <- ggplot(
  data = mktg_data,
  mapping = aes(x = age, y = NumWebPurchases, color = factor(Kidhome))
) +
  geom_point() +
  xlab("Age") +
  ylab("Number of Web Purchases")
ggplotly(chart2)

Analysis of web purchases vs. education shows that PhDs and Graduates make the most web purchases

# Web purchases by education level, one fill colour per level.
# NOTE(review): many rows share each bar position, and dodged columns are
# drawn over one another rather than added up - confirm the bar heights show
# the statistic that is intended.
chart3 <- ggplot(
  data = mktg_data,
  mapping = aes(x = Education, y = NumWebPurchases, fill = Education)
) +
  geom_col(position = "dodge")

print(chart3)

Analysis of WebPurchases vs Marital Status shows that maximum Web Purchases come from Married Graduates and Single PhDs

# Web purchases by marital status, with education levels side by side.
# NOTE(review): many rows share each (Marital_Status, Education) bar, and
# dodged columns are drawn over one another rather than added up - confirm
# the bar heights show the statistic that is intended.
chart4 <- ggplot(
  data = mktg_data,
  mapping = aes(x = Marital_Status, y = NumWebPurchases, fill = Education)
) +
  geom_col(position = "dodge")

print(chart4)

Q2: Is there a relation between web visits and web purchases?

# Simple linear regression of web purchases on monthly web visits.
webvisit_vs_webpurchase <- lm(
  formula = NumWebPurchases ~ NumWebVisitsMonth,
  data = mktg_data
)
summary(webvisit_vs_webpurchase) # coefficient table, R-squared and F-test
## 
## Call:
## lm(formula = NumWebPurchases ~ NumWebVisitsMonth, data = mktg_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4248 -2.0411 -0.3609  1.7670 22.6391 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.42481    0.14123  31.330   <2e-16 ***
## NumWebVisitsMonth -0.06395    0.02417  -2.646   0.0082 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.775 on 2238 degrees of freedom
## Multiple R-squared:  0.003119,   Adjusted R-squared:  0.002673 
## F-statistic: 7.002 on 1 and 2238 DF,  p-value: 0.0082
coef(webvisit_vs_webpurchase) # fitted intercept and slope
##       (Intercept) NumWebVisitsMonth 
##        4.42480627       -0.06394878

Scatter plot of Number of Web Visits vs. Web Purchases

# Monthly web visits (x) against number of web purchases (y).
chart5 <- ggplot(
  data = mktg_data,
  mapping = aes(x = NumWebVisitsMonth, y = NumWebPurchases)
) +
  geom_point() +
  labs(x = "Web Visits", y = "Web Purchase")
ggplotly(chart5)

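**Interpretation:** Web visits and web purchases are negatively related and the slope is statistically significant (estimate -0.064, p = 0.0082), but the relation is very weak: web visits explain only about 0.3% of the variation in web purchases (R-squared = 0.003).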

Q3: Is there a relation between geographical region and the success of a campaign?

# Linear model of the number of web purchases on Country.
# NOTE(review): despite its name, this object models web purchases (not web
# visits) by country.
webvisit_vs_country <- lm(
  formula = NumWebPurchases ~ Country,
  data = mktg_data
)
summary(webvisit_vs_country) # coefficient table, R-squared and F-test
## 
## Call:
## lm(formula = NumWebPurchases ~ Country, data = mktg_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4404 -2.0018 -0.4404  1.9125 22.9982 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.08750    0.21970  18.605   <2e-16 ***
## CountryCA    0.21847    0.27764   0.787    0.431    
## CountryGER  -0.11250    0.33559  -0.335    0.737    
## CountryIND  -0.14155    0.31693  -0.447    0.655    
## CountryME    1.91250    1.61941   1.181    0.238    
## CountrySA    0.05790    0.26680   0.217    0.828    
## CountrySP   -0.08567    0.23520  -0.364    0.716    
## CountryUS    0.35287    0.34513   1.022    0.307    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.779 on 2232 degrees of freedom
## Multiple R-squared:  0.002948,   Adjusted R-squared:  -0.0001785 
## F-statistic: 0.9429 on 7 and 2232 DF,  p-value: 0.4719
coef(webvisit_vs_country) # intercept plus one offset per non-baseline country
## (Intercept)   CountryCA  CountryGER  CountryIND   CountryME   CountrySA 
##  4.08750000  0.21847015 -0.11250000 -0.14155405  1.91250000  0.05790059 
##   CountrySP   CountryUS 
## -0.08567352  0.35286697

# Web purchases by country. The plot object is created first so that it
# exists when it is rendered below.
chart5b <- ggplot(data = mktg_data, aes(x = Country, y = NumWebPurchases)) +
  geom_point() +
  xlab("Country") +
  ylab("Web Purchase")

# Interactive version. The base-graphics style arguments `type` and `lwd`
# were dropped, and the typographic quotes around the `type` value did not
# parse as an R string.
ggplotly(chart5b)

# Static version.
chart5b

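Interpretation: Geographical region is not a statistically significant predictor in this model: no country coefficient is significant and the overall F-test has p = 0.47. Note that the model uses the number of web purchases as its measure of success.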

Q4. Average amount spent on fruit products in the last two years is $26.30

# Mean of the MntFruits column across all rows of mktg_data.
avg_amount_fruits <- mean(mktg_data[["MntFruits"]])
print(avg_amount_fruits)
## [1] 26.30223

Q5. Is the variation in the amount spent on fish and meat affected by any qualitative factors? If yes, which ones?

### Complain is the qualitative factor examined here. The fit below shows no
### significant relation between complaints and the amounts spent on meat or
### fish products.
# NOTE(review): the model uses Complain as the response and the spending
# columns as predictors, whereas the question treats spending as the outcome -
# confirm the intended direction.
fishNmeat_vs_compain <- lm(
  formula = Complain ~ MntMeatProducts + MntFishProducts,
  data = mktg_data
)
summary(fishNmeat_vs_compain)
## 
## Call:
## lm(formula = Complain ~ MntMeatProducts + MntFishProducts, data = mktg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.01133 -0.01112 -0.01056 -0.00813  0.99372 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.134e-02  2.618e-03   4.330 1.55e-05 ***
## MntMeatProducts -7.301e-06  1.097e-05  -0.666    0.506    
## MntFishProducts -1.982e-05  4.533e-05  -0.437    0.662    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0964 on 2237 degrees of freedom
## Multiple R-squared:  0.0006369,  Adjusted R-squared:  -0.0002566 
## F-statistic: 0.7128 on 2 and 2237 DF,  p-value: 0.4904

Q6: Fish has Omega 3 fatty acids, good for brain, accordingly, do people with advanced degrees purchase more fish than others?

## Fish spending by education level, drawn as columns in polar coordinates
## (the y aesthetic is mapped to the angle).
chart6 <- ggplot(
  data = mktg_data,
  mapping = aes(x = Education, y = MntFishProducts, fill = Education)
) +
  geom_col() +
  coord_polar(theta = "y", start = 0)

print(chart6)

**The largest amount of fish purchases comes from people who are Graduates**

Education v Fish Products bar chart

# Stacked columns of fish spending for each education level.
chart7 <- ggplot(
  data = mktg_data,
  mapping = aes(x = Education, y = MntFishProducts, fill = Education)
) +
  geom_col()

print(chart7)

Q7. Teenagers are fussy on food, which foods do families with teenagers spend most on?

## Reshape spending into long form: one row per original row and product
## column, with the columns Teenhome, type (product column name) and value.
product_columns <- c(
  "MntFishProducts", "MntMeatProducts", "MntFruits",
  "MntSweetProducts", "MntWines", "MntGoldProds"
)
df_bar <- do.call(
  rbind,
  lapply(product_columns, function(product) {
    data.frame(
      Teenhome = mktg_data$Teenhome,
      type = product,
      value = mktg_data[[product]]
    )
  })
)

# Analysis by ratio: position "fill" scales every bar to the same height, so
# the segments show each product's share within a Teenhome value.
chart8 <- ggplot(df_bar, aes(x = Teenhome, y = value, fill = type)) +
  geom_col(position = "fill")

chart8

Q8. Which marketing campaign is most successful?

# Column totals of AcceptedCmp1 ... AcceptedCmp4, one row per campaign.
# NOTE(review): only campaigns 1-4 are compared - confirm the data set has no
# further campaign columns that should be included.
campaign_columns <- paste0("AcceptedCmp", 1:4)
df_bar2 <- data.frame(
  AcceptedCmp = unlist(
    lapply(campaign_columns, function(campaign) sum(mktg_data[[campaign]]))
  ),
  name = campaign_columns
)

# One bar per campaign, with height equal to its total.
ggplot(df_bar2, aes(x = name, y = AcceptedCmp, fill = name)) +
  geom_col()

Q9. What kind of customers are complaining?

### Keep only the rows of customers who complained (Complain == 1).
complaint <- mktg_data[which(mktg_data$Complain == 1), ]

# Pie chart of complaining customers by education level.
# Slices must be sized by the number of complaining customers, so the bar
# layer counts rows per Education value instead of mapping the categorical
# Education column to y. Mapping the counts to the angle turns the single
# stacked bar into a pie.
chart8 <- ggplot(complaint, aes(x = "", fill = Education)) +
  geom_bar(width = 1) +
  coord_polar(theta = "y", start = 0) +
  labs(title = "Pie-chart of complaining customers")

chart8

Q10. Average age of customers in the given sample is 52 years

# Five-number summary and mean of the derived age column.
summary(mktg_data[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   44.00   51.00   52.19   62.00  128.00
### Correlation coefficients and p-values between pairs of variables
# Income vs. number of web purchases
cor.test(
  x = mktg_data$Income,
  y = mktg_data$NumWebPurchases,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  mktg_data$Income and mktg_data$NumWebPurchases
## t = 19.801, df = 2214, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3519222 0.4226905
## sample estimates:
##       cor 
## 0.3878778
# Age vs. number of web purchases
cor.test(
  x = mktg_data$age,
  y = mktg_data$NumWebPurchases,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  mktg_data$age and mktg_data$NumWebPurchases
## t = 6.9348, df = 2238, p-value = 5.303e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1042505 0.1853426
## sample estimates:
##       cor 
## 0.1450401
# Monthly web visits vs. number of web purchases
cor.test(
  x = mktg_data$NumWebVisitsMonth,
  y = mktg_data$NumWebPurchases,
  method = "pearson"
)
## 
##  Pearson's product-moment correlation
## 
## data:  mktg_data$NumWebVisitsMonth and mktg_data$NumWebPurchases
## t = -2.6461, df = 2238, p-value = 0.0082
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09703774 -0.01446393
## sample estimates:
##         cor 
## -0.05584633
### Correlation matrix of the variables tested above, shown as a heat map.
# corr_matrix1 has to be computed before it can be plotted. Pairwise-complete
# observations are used because the Income test above ran on fewer rows than
# the others, which indicates missing Income values.
corr_matrix1 <- cor(
  mktg_data[, c("Income", "age", "NumWebVisitsMonth", "NumWebPurchases")],
  use = "pairwise.complete.obs"
)
ggcorrplot(corr_matrix1)