# Show the working-directory contents so the workbook file name can be checked.
list.files(path = ".")
## [1] "Database Philadelphia-5 (1).xlsx" "LIA R Analysis.Rmd"
## [3] "LIA-R-Analysis.Rmd"
# Read the workbook into `lia`. The messages below record the automatic
# renaming of one blank header (-> `...32`) and of the two `Phone:` headers.
lia <- read_excel(path = "Database Philadelphia-5 (1).xlsx")
## New names:
## • `` -> `...32`
## • `Phone:` -> `Phone:...37`
## • `Phone:` -> `Phone:...38`
# Preview the first six rows of the 39-column tibble.
head(lia, n = 6)
## # A tibble: 6 × 39
## Member Gender Age Height Weight Ethnic Religious Active Marital
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 100357 F 29 69 165 NA P N N
## 2 100365 M 29 72 212 NA O Y N
## 3 100377 F 25 69 183 NA O Y N
## 4 100411 M 57 69 182 AA O Y S
## 5 100420 M 46 68 185 NA C N W
## 6 100422 F 37 69 160 NA C Y D
## # ℹ 30 more variables: `Number of Pets` <dbl>, Education <chr>,
## # Occupation <chr>, Income <dbl>, Political <chr>, Health <chr>,
## # Disabilities <chr>, Smoking <chr>, Alcohol <chr>, Communicable <chr>,
## # Criminal <chr>, Bicycling <chr>, Cooking <chr>, Dancing <chr>,
## # Gardening <chr>, Outdoor <chr>, Reading <chr>, Sports <chr>, Scuba <chr>,
## # Theater <chr>, Travel <chr>, `Name:` <chr>, ...32 <chr>,
## # `Street Address` <chr>, City <chr>, State <chr>, Zip <dbl>, …
# Print every column name, including the repaired ones.
colnames(lia)
## [1] "Member" "Gender" "Age" "Height"
## [5] "Weight" "Ethnic" "Religious" "Active"
## [9] "Marital" "Number of Pets" "Education" "Occupation"
## [13] "Income" "Political" "Health" "Disabilities"
## [17] "Smoking" "Alcohol" "Communicable" "Criminal"
## [21] "Bicycling" "Cooking" "Dancing" "Gardening"
## [25] "Outdoor" "Reading" "Sports" "Scuba"
## [29] "Theater" "Travel" "Name:" "...32"
## [33] "Street Address" "City" "State" "Zip"
## [37] "Phone:...37" "Phone:...38" "Secure Inc."
# Five-number summary plus mean of Income. The recorded minimum is negative
# and the mean sits well above the median.
summary(lia[["Income"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -204800 30933 43200 60126 60600 3725148
# Standard deviation of Income.
sd(lia[["Income"]], na.rm = TRUE)
## [1] 138120.7
# Count of missing Income values; the recorded result is 0, so the
# `na.rm = TRUE` arguments in this section are only a safeguard.
sum(is.na(lia[["Income"]]))
## [1] 0
# Quartiles of Income.
quantile(lia[["Income"]], probs = c(0.25, 0.50, 0.75), na.rm = TRUE)
## 25% 50% 75%
## 30933.33 43200.00 60600.00
# Histogram of Income over its full range, using 30 bins.
ggplot(data = lia, mapping = aes(x = Income)) +
  geom_histogram(bins = 30)

# Histogram with 10,000-wide bins, viewed on the 0-200,000 part of the x axis.
# coord_cartesian() zooms the view without discarding rows outside the limits.
ggplot(data = lia, mapping = aes(x = Income)) +
  geom_histogram(binwidth = 10000) +
  coord_cartesian(xlim = c(0, 200000))

# Box plot of Income.
ggplot(data = lia, mapping = aes(y = Income)) +
  geom_boxplot()

# Ten members with the lowest Income (ascending order); the first three
# recorded values are negative.
# NOTE(review): negative incomes may be data-entry artefacts — confirm.
head(
  lia[
    order(lia$Income, decreasing = FALSE),
    c("Member", "Age", "Occupation", "Education", "Income")
  ],
  n = 10
)
## # A tibble: 10 × 5
## Member Age Occupation Education Income
## <dbl> <dbl> <chr> <chr> <dbl>
## 1 106134 58 Prof A -204800
## 2 108741 53 Constr B -61700
## 3 103139 29 Labor B -29733.
## 4 104538 29 Cler H 9100
## 5 108516 26 AF S 9900
## 6 105249 27 Cler S 10300
## 7 107813 28 Serv B 11100
## 8 104150 29 AF S 12000
## 9 102850 26 Trans B 12300
## 10 103767 28 Mech H 12500
# Ten members with the highest Income (descending order).
# NOTE(review): the 999999 entry looks like it could be a placeholder — confirm.
head(
  lia[
    order(lia$Income, decreasing = TRUE),
    c("Member", "Age", "Occupation", "Education", "Income")
  ],
  n = 10
)
## # A tibble: 10 × 5
## Member Age Occupation Education Income
## <dbl> <dbl> <chr> <chr> <dbl>
## 1 101963 45 M&A B 3725148
## 2 105966 34 Prof A 1180040
## 3 106158 49 Ed A 999999
## 4 104760 59 Prof A 985900
## 5 109486 82 Mgmt S 800000
## 6 103458 59 Ed A 780000
## 7 105329 45 Prof A 565000
## 8 107664 45 Prof B 464200
## 9 103607 49 Labor H 319100
## 10 103247 57 Prof A 234400