library(tidyverse)
library(rio)
library(janitor)
library(skimr)
library(kableExtra)
library(corrplot)RFinalTest
1 Introduction
In this project we will analyze real-world data on individuals annual cafe expenditures. The demographics in the provided data set contains information (age, sex, education), location indicators (Urban vs. non-urban), as well as neighborhood median income by the ZIP code. The goal of these project will be to explore with factors are associated with higher spending while evaluating how demographic and regional income can predict expenditure spending.
2 Libraries Used
3 Explanation of Libraries
3.1 tidyverse
A collection of R Packages (like dplyr or ggplot) designed for cleaning/wrangling data and transforming it to be easily translatable in a visual sense.
3.2 rio
A package designed for importing or exporting different formats of data.
3.3 janitor
Tools for quickly organizing data especially for column names that are difficult to work with.
3.4 skimr
Easy and readable summaries of data showcasing distributions, missing values, and basic stats.
3.5 kableExtra
Enhances tables created with kable so that they look nicer in HTML/Word/PDF reports using borders, colors, alignments, and other features.
3.6 corrplot
Creates correlation plots to help visualize relationships between numeric variables.
4 Data
4.1 Loading Data
DataCafeOrg = import("ProjectDataCafeNew.csv") |>
clean_names("upper_camel")
DataZIPIncomeOrg = import("DataZIPIncomeNew-1.csv") |>
clean_names("upper_camel")|>
rename(IncomeZIP = Income)4.2 Selecting Variables
DataCafe = DataCafeOrg |>
select(CafeExp, Age, SexMale,UrbanYes, YearsAfterHighschool, Zip)
kable(head(DataCafe))| CafeExp | Age | SexMale | UrbanYes | YearsAfterHighschool | Zip |
|---|---|---|---|---|---|
| 75.51034 | 18.71554 | 1 | 1 | 0.7259538 | 19104 |
| 85.95790 | 49.96018 | 0 | 0 | 32.2933391 | 19105 |
| 91.17824 | 39.24468 | 0 | 0 | 21.4324698 | 19105 |
| 88.10348 | 26.32100 | 1 | 1 | 8.3337170 | 19104 |
| 128.30616 | 59.77155 | 1 | 1 | 41.9278606 | 19102 |
| 97.90179 | 28.56631 | 0 | 1 | 10.8204892 | 19104 |
4.3 Joining Tables
DataCafeWithInc = left_join(
DataCafe, DataZIPIncomeOrg,
join_by(Zip == Zip)
)
kable(head(DataCafeWithInc))| CafeExp | Age | SexMale | UrbanYes | YearsAfterHighschool | Zip | IncomeZIP |
|---|---|---|---|---|---|---|
| 75.51034 | 18.71554 | 1 | 1 | 0.7259538 | 19104 | 80 |
| 85.95790 | 49.96018 | 0 | 0 | 32.2933391 | 19105 | 70 |
| 91.17824 | 39.24468 | 0 | 0 | 21.4324698 | 19105 | 70 |
| 88.10348 | 26.32100 | 1 | 1 | 8.3337170 | 19104 | 80 |
| 128.30616 | 59.77155 | 1 | 1 | 41.9278606 | 19102 | 90 |
| 97.90179 | 28.56631 | 0 | 1 | 10.8204892 | 19104 | 80 |
5 Descriptive Statistics
5.1 N, Min, Max, Mean, SD, etc.
DataCafeNum = DataCafeWithInc |>
select(-Zip)
n_total = nrow(DataCafeNum)
DescrStatc = skim(DataCafeNum) |>
mutate (
N = n_total - n_missing
) |>
select(
Variable = skim_variable,
Missing = n_missing,
N,
Mean = numeric.mean,
SD = numeric.sd,
Min = numeric.p0,
Max = numeric.p100
)
kable(DescrStatc)| Variable | Missing | N | Mean | SD | Min | Max |
|---|---|---|---|---|---|---|
| CafeExp | 0 | 1600 | 100.388262 | 26.5729174 | 1.7821390 | 183.74515 |
| Age | 0 | 1600 | 37.069247 | 10.5231428 | 18.0206520 | 59.91429 |
| SexMale | 0 | 1600 | 0.444375 | 0.4970516 | 0.0000000 | 1.00000 |
| UrbanYes | 0 | 1600 | 0.695625 | 0.4602861 | 0.0000000 | 1.00000 |
| YearsAfterHighschool | 0 | 1600 | 19.572534 | 10.5278207 | 0.0895441 | 42.69667 |
| IncomeZIP | 0 | 1600 | 79.943750 | 14.4893253 | 50.0000000 | 110.00000 |
5.2 Group and Summarize (Pivot Table)
CafePivot = DataCafeWithInc |>
group_by(UrbanYes, SexMale ) |>
summarise (
N= n(),
MeanCafeExp = mean(CafeExp),
SDCafeExp = sd(CafeExp)
)
kable(CafePivot)| UrbanYes | SexMale | N | MeanCafeExp | SDCafeExp |
|---|---|---|---|---|
| 0 | 0 | 487 | 81.49312 | 23.57402 |
| 1 | 0 | 402 | 103.50931 | 23.05510 |
| 1 | 1 | 711 | 111.56586 | 23.11128 |
5.2.1 Average Price by ZIP Code
TableAvgCafeZip = DataCafeWithInc |>
group_by(Zip) |>
summarise(
N = n(),
MeanCafeExp = mean(CafeExp),
SDCafeExp = sd(CafeExp)
)
kable(TableAvgCafeZip)| Zip | N | MeanCafeExp | SDCafeExp |
|---|---|---|---|
| 19101 | 157 | 79.95523 | 24.13239 |
| 19102 | 230 | 111.69190 | 20.53935 |
| 19103 | 92 | 69.05188 | 25.33880 |
| 19104 | 631 | 103.28528 | 22.98335 |
| 19105 | 238 | 87.31683 | 20.34787 |
| 19106 | 167 | 118.35366 | 23.44256 |
| 19107 | 85 | 121.25712 | 21.52090 |
5.2.2 Average Price by ZIP Code & UrbanYes
TableAvgCafeZipUrban = DataCafeWithInc |>
group_by(Zip, UrbanYes) |>
summarise(
N = n(),
MeanCafeExp = mean(CafeExp),
SDCafeExp = sd(CafeExp)
)
kable(TableAvgCafeZipUrban)| Zip | UrbanYes | N | MeanCafeExp | SDCafeExp |
|---|---|---|---|---|
| 19101 | 0 | 157 | 79.95523 | 24.13239 |
| 19102 | 1 | 230 | 111.69190 | 20.53935 |
| 19103 | 0 | 92 | 69.05188 | 25.33880 |
| 19104 | 1 | 631 | 103.28528 | 22.98335 |
| 19105 | 0 | 238 | 87.31683 | 20.34787 |
| 19106 | 1 | 167 | 118.35366 | 23.44256 |
| 19107 | 1 | 85 | 121.25712 | 21.52090 |
7 Linear Regression
7.1 All Predictors
ModelLMAll = lm(CafeExp ~ ., data = DataCafeNum)
summary(ModelLMAll)
Call:
lm(formula = CafeExp ~ ., data = DataCafeNum)
Residuals:
Min 1Q Median 3Q Max
-89.530 -14.665 0.587 15.154 66.153
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -22.38384 34.91855 -0.641 0.522
Age 3.05243 1.96592 1.553 0.121
SexMale -0.40002 1.61633 -0.247 0.805
UrbanYes 10.00672 1.93032 5.184 2.45e-07 ***
YearsAfterHighschool -2.81462 1.96510 -1.432 0.152
IncomeZIP 0.72460 0.07064 10.257 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 22.38 on 1594 degrees of freedom
Multiple R-squared: 0.2928, Adjusted R-squared: 0.2906
F-statistic: 132 on 5 and 1594 DF, p-value: < 2.2e-16
7.2 Selected Predictors
ModelLMSelect = lm(CafeExp ~ UrbanYes + IncomeZIP, data = DataCafeNum)
summary(ModelLMSelect)
Call:
lm(formula = CafeExp ~ UrbanYes + IncomeZIP, data = DataCafeNum)
Residuals:
Min 1Q Median 3Q Max
-87.396 -15.034 0.825 14.715 66.001
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 36.20036 3.99762 9.055 < 2e-16 ***
UrbanYes 9.64866 1.93139 4.996 6.51e-07 ***
IncomeZIP 0.71896 0.06136 11.718 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 22.51 on 1597 degrees of freedom
Multiple R-squared: 0.283, Adjusted R-squared: 0.2821
F-statistic: 315.2 on 2 and 1597 DF, p-value: < 2.2e-16
8 Summary
CafeExp = 36.2 + 9.65(UrbanYes) + 0.72(IncomeZIP)
The average CafeExpenature is estimated to be around 36.2 dollars, while clieants who are in urban areas are more likely to spend 9.65 dollars more on average than those in non urban areas while for every additional 1000 dollars a IncomeZip Increases the the cafe spending by an average of 72 cents.