#1 Data visualization and exploration tasks with gpa data set
# Load the gpa data set into the session, then print a per-column summary.
data("gpa")
summary(gpa)
## gpa studyweek sleepnight out gender
## Min. :2.900 Min. : 2.00 Min. :5.000 Min. :0.000 female:43
## 1st Qu.:3.400 1st Qu.:10.00 1st Qu.:6.000 1st Qu.:1.250 male :12
## Median :3.650 Median :15.00 Median :7.000 Median :2.000
## Mean :3.600 Mean :19.15 Mean :7.064 Mean :2.109
## 3rd Qu.:3.825 3rd Qu.:26.50 3rd Qu.:8.000 3rd Qu.:3.000
## Max. :4.670 Max. :50.00 Max. :9.000 Max. :4.000
# Open the help page for the gpa data set to look up its variables.
help("gpa")
##1.1 Precise meaning of each variable
- gpa: Student’s grade point average
- studyweek: Hours spent studying per week
- sleepnight: Hours slept per night
- out: Nights out per week
- gender: The gender of the student
##1.2 studyweek vs gpa
# GPA against weekly study hours: semi-transparent points with a linear-model
# fit and its confidence band layered on top.
ggplot(data = gpa, mapping = aes(studyweek, gpa)) +
  geom_point(alpha = 0.7) +
  geom_smooth(se = TRUE, method = "lm") +
  ggtitle("GPA vs Study Hours per Week") +
  xlab("Study hours per week") +
  ylab("GPA") +
  theme_hw
The number of hours studied per week seems to increase GPA only slightly. There are even outliers where a low amount of study time goes with a high GPA, and a high amount of study time with a low GPA.
##1.3 out vs gpa
# GPA against nights out per week: points are jittered (width 0.15) so that
# repeated values of the discrete-looking x variable do not overlap, with a
# linear-model fit on top.
ggplot(data = gpa, mapping = aes(out, gpa)) +
  geom_jitter(alpha = 0.7, width = 0.15) +
  geom_smooth(se = TRUE, method = "lm") +
  ggtitle("GPA vs Nights Out per Week") +
  xlab("Nights out per week") +
  ylab("GPA") +
  theme_hw
In general, the more nights out, the higher the gpa, but not by
much.
##1.4 out vs sleepnight
# Hours of sleep per night against nights out per week: jittered points
# (width 0.15) plus a linear-model fit with its confidence band.
ggplot(data = gpa, mapping = aes(out, sleepnight)) +
  geom_jitter(alpha = 0.7, width = 0.15) +
  geom_smooth(se = TRUE, method = "lm") +
  ggtitle("Sleep per Night vs Nights Out per Week") +
  xlab("Nights out per week") +
  ylab("Hours of sleep per night") +
  theme_hw
The more nights spent out per week, the more hours of sleep students get per night.
##1.5 gender vs studyweek
# Weekly study hours by gender: the error-bar layer is drawn first so the
# narrower boxes sit on top of the whisker caps.
ggplot(data = gpa, mapping = aes(gender, studyweek)) +
  stat_boxplot(width = 0.5, geom = "errorbar") +
  geom_boxplot(outlier.alpha = 0.6, width = 0.18) +
  ggtitle("Study Hours per Week by Gender") +
  xlab("Gender") +
  ylab("Study hours per week") +
  theme_hw
The medians for both genders are about the same for study hours per week,
but females have a higher upper limit for studying per week.
##1.6 gender vs out
# Nights out per week by gender: error-bar whisker caps underneath narrow
# box plots.
ggplot(data = gpa, mapping = aes(gender, out)) +
  stat_boxplot(width = 0.5, geom = "errorbar") +
  geom_boxplot(outlier.alpha = 0.6, width = 0.18) +
  ggtitle("Nights Out per Week by Gender") +
  xlab("Gender") +
  ylab("Nights out per week") +
  theme_hw
The upper limit for nights out is the same for both genders,
but males go out more.
##1.7 What is the GPA of each gender?
# GPA by gender: error-bar whisker caps underneath narrow box plots.
ggplot(data = gpa, mapping = aes(gender, gpa)) +
  stat_boxplot(width = 0.5, geom = "errorbar") +
  geom_boxplot(outlier.alpha = 0.6, width = 0.18) +
  ggtitle("GPA by Gender") +
  xlab("Gender") +
  ylab("GPA") +
  theme_hw
Based on the previous graphs, females go out less at night and study more per week than males, which may explain why females have a higher GPA in general.
#2. Data visualization tasks with loans_full_schema data set
# Load the loans_full_schema data set and print a compact overview of its
# columns.
data("loans_full_schema")
glimpse(loans_full_schema)
## Rows: 10,000
## Columns: 55
## $ emp_title <chr> "global config engineer ", "warehouse…
## $ emp_length <dbl> 3, 10, 3, 1, 10, NA, 10, 10, 10, 3, 1…
## $ state <fct> NJ, HI, WI, PA, CA, KY, MI, AZ, NV, I…
## $ homeownership <fct> MORTGAGE, RENT, RENT, RENT, RENT, OWN…
## $ annual_income <dbl> 90000, 40000, 40000, 30000, 35000, 34…
## $ verified_income <fct> Verified, Not Verified, Source Verifi…
## $ debt_to_income <dbl> 18.01, 5.04, 21.15, 10.16, 57.96, 6.4…
## $ annual_income_joint <dbl> NA, NA, NA, NA, 57000, NA, 155000, NA…
## $ verification_income_joint <fct> , , , , Verified, , Not Verified, , ,…
## $ debt_to_income_joint <dbl> NA, NA, NA, NA, 37.66, NA, 13.12, NA,…
## $ delinq_2y <int> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0…
## $ months_since_last_delinq <int> 38, NA, 28, NA, NA, 3, NA, 19, 18, NA…
## $ earliest_credit_line <dbl> 2001, 1996, 2006, 2007, 2008, 1990, 2…
## $ inquiries_last_12m <int> 6, 1, 4, 0, 7, 6, 1, 1, 3, 0, 4, 4, 8…
## $ total_credit_lines <int> 28, 30, 31, 4, 22, 32, 12, 30, 35, 9,…
## $ open_credit_lines <int> 10, 14, 10, 4, 16, 12, 10, 15, 21, 6,…
## $ total_credit_limit <int> 70795, 28800, 24193, 25400, 69839, 42…
## $ total_credit_utilized <int> 38767, 4321, 16000, 4997, 52722, 3898…
## $ num_collections_last_12m <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ num_historical_failed_to_pay <int> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ months_since_90d_late <int> 38, NA, 28, NA, NA, 60, NA, 71, 18, N…
## $ current_accounts_delinq <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ total_collection_amount_ever <int> 1250, 0, 432, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ current_installment_accounts <int> 2, 0, 1, 1, 1, 0, 2, 2, 6, 1, 2, 1, 2…
## $ accounts_opened_24m <int> 5, 11, 13, 1, 6, 2, 1, 4, 10, 5, 6, 7…
## $ months_since_last_credit_inquiry <int> 5, 8, 7, 15, 4, 5, 9, 7, 4, 17, 3, 4,…
## $ num_satisfactory_accounts <int> 10, 14, 10, 4, 16, 12, 10, 15, 21, 6,…
## $ num_accounts_120d_past_due <int> 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, …
## $ num_accounts_30d_past_due <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ num_active_debit_accounts <int> 2, 3, 3, 2, 10, 1, 3, 5, 11, 3, 2, 2,…
## $ total_debit_limit <int> 11100, 16500, 4300, 19400, 32700, 272…
## $ num_total_cc_accounts <int> 14, 24, 14, 3, 20, 27, 8, 16, 19, 7, …
## $ num_open_cc_accounts <int> 8, 14, 8, 3, 15, 12, 7, 12, 14, 5, 8,…
## $ num_cc_carrying_balance <int> 6, 4, 6, 2, 13, 5, 6, 10, 14, 3, 5, 3…
## $ num_mort_accounts <int> 1, 0, 0, 0, 0, 3, 2, 7, 2, 0, 2, 3, 3…
## $ account_never_delinq_percent <dbl> 92.9, 100.0, 93.5, 100.0, 100.0, 78.1…
## $ tax_liens <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ public_record_bankrupt <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ loan_purpose <fct> moving, debt_consolidation, other, de…
## $ application_type <fct> individual, individual, individual, i…
## $ loan_amount <int> 28000, 5000, 2000, 21600, 23000, 5000…
## $ term <dbl> 60, 36, 36, 36, 36, 36, 60, 60, 36, 3…
## $ interest_rate <dbl> 14.07, 12.61, 17.09, 6.72, 14.07, 6.7…
## $ installment <dbl> 652.53, 167.54, 71.40, 664.19, 786.87…
## $ grade <fct> C, C, D, A, C, A, C, B, C, A, C, B, C…
## $ sub_grade <fct> C3, C1, D1, A3, C3, A3, C2, B5, C2, A…
## $ issue_month <fct> Mar-2018, Feb-2018, Feb-2018, Jan-201…
## $ loan_status <fct> Current, Current, Current, Current, C…
## $ initial_listing_status <fct> whole, whole, fractional, whole, whol…
## $ disbursement_method <fct> Cash, Cash, Cash, Cash, Cash, Cash, C…
## $ balance <dbl> 27015.86, 4651.37, 1824.63, 18853.26,…
## $ paid_total <dbl> 1999.330, 499.120, 281.800, 3312.890,…
## $ paid_principal <dbl> 984.14, 348.63, 175.37, 2746.74, 1569…
## $ paid_interest <dbl> 1015.19, 150.49, 106.43, 566.15, 754.…
## $ paid_late_fees <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
##2.1 Histogram Density Curve
# Distribution of interest rates: the histogram is put on the density scale
# (after_stat(density)) so the kernel density curve can share its y axis.
ggplot(data = loans_full_schema, mapping = aes(interest_rate)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    alpha = 0.7,
    boundary = 0,
    bins = 35
  ) +
  geom_density(linewidth = 1) +
  ggtitle("Distribution of Interest Rates") +
  xlab("Interest rate (%)") +
  ylab("Density") +
  theme_hw
The distribution is skewed to the right, meaning most interest rates are
less than 15%.
##2.2 Categorical effect on a numeric distribution
# Interest rate by homeownership status. Rows missing either variable are
# removed before plotting; error-bar whisker caps sit under narrow boxes.
ggplot(
  data = filter(
    loans_full_schema,
    !is.na(homeownership) & !is.na(interest_rate)
  ),
  mapping = aes(homeownership, interest_rate)
) +
  stat_boxplot(width = 0.5, geom = "errorbar") +
  geom_boxplot(outlier.alpha = 0.5, width = 0.18) +
  ggtitle("Interest Rate by Homeownership") +
  xlab("Homeownership status") +
  ylab("Interest rate (%)") +
  theme_hw
The interest rates for mortgage, own, and rent are generally about the
same, with rent slightly higher than mortgage and own. Own has a
higher median than mortgage.
##2.3 Bin heatmap for two numeric variables
# 2D binned heatmap of loan amount against annual income, using 35 bins and
# dollar-formatted tick labels on both axes. Rows missing either variable
# are dropped first.
loans_full_schema %>%
  filter(!(is.na(annual_income) | is.na(loan_amount))) %>%
  ggplot(mapping = aes(annual_income, loan_amount)) +
  geom_bin2d(bins = 35) +
  scale_y_continuous(labels = dollar_format()) +
  scale_x_continuous(labels = dollar_format()) +
  ggtitle("2D Binned Heatmap: Annual Income vs Loan Amount") +
  xlab("Annual income") +
  ylab("Loan amount") +
  theme_hw
In general, the lower the annual income, the higher the loan amount.
##2.4 facet_wrap
# Interest rate against loan amount, one panel per loan purpose laid out in
# three columns, each with its own linear fit (no confidence band). Rows
# missing any of the three variables are dropped first.
ggplot(
  data = filter(
    loans_full_schema,
    !is.na(loan_purpose) & !is.na(loan_amount) & !is.na(interest_rate)
  ),
  mapping = aes(loan_amount, interest_rate)
) +
  geom_point(alpha = 0.25) +
  geom_smooth(se = FALSE, method = "lm") +
  scale_x_continuous(labels = dollar_format()) +
  facet_wrap(vars(loan_purpose), ncol = 3) +
  ggtitle("Interest Rate vs Loan Amount, Faceted by Loan Purpose") +
  xlab("Loan amount") +
  ylab("Interest rate (%)") +
  theme_hw
Lower interest rates tend to go with lower loan amounts; most loans are
for credit cards or debt consolidation.
##2.5 facet_grid informative plot
# Interest-rate histograms in a grid: rows are income-verification status,
# columns are homeownership status. Rows missing any of the three variables
# are dropped first.
loans_full_schema %>%
  filter(
    !(is.na(homeownership) | is.na(verified_income) | is.na(interest_rate))
  ) %>%
  ggplot(mapping = aes(interest_rate)) +
  geom_histogram(alpha = 0.7, boundary = 0, bins = 30) +
  facet_grid(rows = vars(verified_income), cols = vars(homeownership)) +
  ggtitle(
    "Interest Rate Distributions by Homeownership and Income Verification"
  ) +
  xlab("Interest rate (%)") +
  ylab("Count") +
  theme_hw
Owning has the lowest interest rates, while mortgage has the highest
count of loans, concentrated at the lower end of the interest rate range.
##2.6 Do borrowers with higher debt-to-income tend to have higher interest rates?
# Interest rate against debt-to-income ratio with a loess smoother and its
# confidence band. Rows missing either variable are dropped first.
ggplot(
  data = filter(
    loans_full_schema,
    !is.na(debt_to_income) & !is.na(interest_rate)
  ),
  mapping = aes(debt_to_income, interest_rate)
) +
  geom_point(alpha = 0.25) +
  geom_smooth(se = TRUE, method = "loess") +
  ggtitle("Interest Rate vs Debt-to-Income Ratio") +
  xlab("Debt-to-income ratio") +
  ylab("Interest rate (%)") +
  theme_hw
As debt-to-income (DTI) increases from low levels, interest rates rise.
But when DTI is high, the relationship between DTI and interest rate
becomes less clear due to the small number of data points.
#3 Data visualization and exploration tasks with ames data set
# Load the ames data set and print a compact overview of its columns.
data("ames")
glimpse(ames)
## Rows: 2,930
## Columns: 82
## $ Order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ PID <int> 526301100, 526350040, 526351010, 526353030, 527105010,…
## $ area <int> 1656, 896, 1329, 2110, 1629, 1604, 1338, 1280, 1616, 1…
## $ price <int> 215000, 105000, 172000, 244000, 189900, 195500, 213500…
## $ MS.SubClass <int> 20, 20, 20, 20, 60, 60, 120, 120, 120, 60, 60, 20, 60,…
## $ MS.Zoning <fct> RL, RH, RL, RL, RL, RL, RL, RL, RL, RL, RL, RL, RL, RL…
## $ Lot.Frontage <int> 141, 80, 81, 93, 74, 78, 41, 43, 39, 60, 75, NA, 63, 8…
## $ Lot.Area <int> 31770, 11622, 14267, 11160, 13830, 9978, 4920, 5005, 5…
## $ Street <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, …
## $ Alley <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ Lot.Shape <fct> IR1, Reg, IR1, Reg, IR1, IR1, Reg, IR1, IR1, Reg, IR1,…
## $ Land.Contour <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, HLS, Lvl, Lvl, Lvl,…
## $ Utilities <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub…
## $ Lot.Config <fct> Corner, Inside, Corner, Corner, Inside, Inside, Inside…
## $ Land.Slope <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl,…
## $ Neighborhood <fct> NAmes, NAmes, NAmes, NAmes, Gilbert, Gilbert, StoneBr,…
## $ Condition.1 <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, Norm, Norm,…
## $ Condition.2 <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, …
## $ Bldg.Type <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, TwnhsE, TwnhsE, Tw…
## $ House.Style <fct> 1Story, 1Story, 1Story, 1Story, 2Story, 2Story, 1Story…
## $ Overall.Qual <int> 6, 5, 6, 7, 5, 6, 8, 8, 8, 7, 6, 6, 6, 7, 8, 8, 8, 9, …
## $ Overall.Cond <int> 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 7, 2, …
## $ Year.Built <int> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ Year.Remod.Add <int> 1960, 1961, 1958, 1968, 1998, 1998, 2001, 1992, 1996, …
## $ Roof.Style <fct> Hip, Gable, Hip, Hip, Gable, Gable, Gable, Gable, Gabl…
## $ Roof.Matl <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, …
## $ Exterior.1st <fct> BrkFace, VinylSd, Wd Sdng, BrkFace, VinylSd, VinylSd, …
## $ Exterior.2nd <fct> Plywood, VinylSd, Wd Sdng, BrkFace, VinylSd, VinylSd, …
## $ Mas.Vnr.Type <fct> Stone, None, BrkFace, None, None, BrkFace, None, None,…
## $ Mas.Vnr.Area <int> 112, 0, 108, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603,…
## $ Exter.Qual <fct> TA, TA, TA, Gd, TA, TA, Gd, Gd, Gd, TA, TA, TA, TA, TA…
## $ Exter.Cond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, Gd, TA, TA…
## $ Foundation <fct> CBlock, CBlock, CBlock, CBlock, PConc, PConc, PConc, P…
## $ Bsmt.Qual <fct> TA, TA, TA, TA, Gd, TA, Gd, Gd, Gd, TA, Gd, Gd, Gd, Gd…
## $ Bsmt.Cond <fct> Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
## $ Bsmt.Exposure <fct> Gd, No, No, No, No, No, Mn, No, No, No, No, No, No, Gd…
## $ BsmtFin.Type.1 <fct> BLQ, Rec, ALQ, ALQ, GLQ, GLQ, GLQ, ALQ, GLQ, Unf, Unf,…
## $ BsmtFin.SF.1 <int> 639, 468, 923, 1065, 791, 602, 616, 263, 1180, 0, 0, 9…
## $ BsmtFin.Type.2 <fct> Unf, LwQ, Unf, Unf, Unf, Unf, Unf, Unf, Unf, Unf, Unf,…
## $ BsmtFin.SF.2 <int> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1120, 0, 0…
## $ Bsmt.Unf.SF <int> 441, 270, 406, 1045, 137, 324, 722, 1017, 415, 994, 76…
## $ Total.Bsmt.SF <int> 1080, 882, 1329, 2110, 928, 926, 1338, 1280, 1595, 994…
## $ Heating <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, …
## $ Heating.QC <fct> Fa, TA, TA, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Gd, Ex, Gd, Gd…
## $ Central.Air <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, …
## $ Electrical <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr…
## $ X1st.Flr.SF <int> 1656, 896, 1329, 2110, 928, 926, 1338, 1280, 1616, 102…
## $ X2nd.Flr.SF <int> 0, 0, 0, 0, 701, 678, 0, 0, 0, 776, 892, 0, 676, 0, 0,…
## $ Low.Qual.Fin.SF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Bsmt.Full.Bath <int> 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, …
## $ Bsmt.Half.Bath <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Full.Bath <int> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 2, 1, …
## $ Half.Bath <int> 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, …
## $ Bedroom.AbvGr <int> 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 2, 1, 4, 4, 1, …
## $ Kitchen.AbvGr <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Kitchen.Qual <fct> TA, TA, Gd, Ex, TA, Gd, Gd, Gd, Gd, Gd, TA, TA, TA, Gd…
## $ TotRms.AbvGrd <int> 7, 5, 6, 8, 6, 7, 6, 5, 5, 7, 7, 6, 7, 5, 4, 12, 8, 8,…
## $ Functional <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ,…
## $ Fireplaces <int> 2, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, …
## $ Fireplace.Qu <fct> Gd, NA, NA, TA, TA, Gd, NA, NA, TA, TA, TA, NA, Gd, Po…
## $ Garage.Type <fct> Attchd, Attchd, Attchd, Attchd, Attchd, Attchd, Attchd…
## $ Garage.Yr.Blt <int> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ Garage.Finish <fct> Fin, Unf, Unf, Fin, Fin, Fin, Fin, RFn, RFn, Fin, Fin,…
## $ Garage.Cars <int> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, …
## $ Garage.Area <int> 528, 730, 312, 522, 482, 470, 582, 506, 608, 442, 440,…
## $ Garage.Qual <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
## $ Garage.Cond <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
## $ Paved.Drive <fct> P, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, …
## $ Wood.Deck.SF <int> 210, 140, 393, 0, 212, 360, 0, 0, 237, 140, 157, 483, …
## $ Open.Porch.SF <int> 62, 0, 36, 0, 34, 36, 0, 82, 152, 60, 84, 21, 75, 0, 5…
## $ Enclosed.Porch <int> 0, 0, 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ X3Ssn.Porch <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Screen.Porch <int> 0, 120, 0, 0, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 140, 210…
## $ Pool.Area <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Pool.QC <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ Fence <fct> NA, MnPrv, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, GdPr…
## $ Misc.Feature <fct> NA, NA, Gar2, NA, NA, NA, NA, NA, NA, NA, NA, Shed, NA…
## $ Misc.Val <int> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 500, 0, 0, 0, 0, …
## $ Mo.Sold <int> 5, 6, 6, 4, 3, 6, 4, 1, 3, 6, 4, 3, 5, 2, 6, 6, 6, 6, …
## $ Yr.Sold <int> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
## $ Sale.Type <fct> WD , WD , WD , WD , WD , WD , WD , WD , WD , WD , WD ,…
## $ Sale.Condition <fct> Normal, Normal, Normal, Normal, Normal, Normal, Normal…
##3.1 Area vs price
# Sale price against area with a linear-model fit and confidence band; the
# y axis is labelled in dollars.
ggplot(data = ames, mapping = aes(area, price)) +
  geom_point(alpha = 0.25) +
  geom_smooth(se = TRUE, method = "lm") +
  scale_y_continuous(labels = dollar_format()) +
  ggtitle("Sale Price vs Area") +
  xlab("Area (square feet)") +
  ylab("Sale price") +
  theme_hw
Generally, the trend is that the larger the area, the higher the sale price,
in an almost linear relationship.
##3.2 Bldg.Type vs price
# Sale price by building type. Rows missing either variable are dropped
# first; error-bar whisker caps sit under narrow boxes and the y axis is
# labelled in dollars.
ames %>%
  filter(!(is.na(Bldg.Type) | is.na(price))) %>%
  ggplot(mapping = aes(Bldg.Type, price)) +
  stat_boxplot(width = 0.5, geom = "errorbar") +
  geom_boxplot(outlier.alpha = 0.4, width = 0.18) +
  scale_y_continuous(labels = dollar_format()) +
  ggtitle("Sale Price by Building Type") +
  xlab("Building type (Bldg.Type)") +
  ylab("Sale price") +
  theme_hw
- 1Fam: Single-family detached
- 2FmCon: Two-family conversion (originally single-family, converted to two-family)
- Duplx: Duplex
- TwnhsE: Townhouse end unit
- Twnhs: Townhouse inside unit
All the home types tend to have around the same median price, but single-family homes have the widest range and the most outliers, showing that their price varies greatly compared to the other home types.
##3.3 Bldg.Type and area together vs price
# Sale price against area, coloured by building type. Because colour is
# mapped in the plot-level aesthetics, the linear fit is computed separately
# for each building type.
ggplot(
  data = ames,
  mapping = aes(x = area, y = price, colour = Bldg.Type)
) +
  geom_point(alpha = 0.35) +
  geom_smooth(se = FALSE, method = "lm") +
  scale_y_continuous(labels = dollar_format()) +
  ggtitle("Sale Price vs Area, Colored by Building Type") +
  xlab("Area (square feet)") +
  ylab("Sale price") +
  labs(colour = "Bldg.Type") +
  theme_hw
For all the different home types, price increases as area increases, with some types increasing more in price than others.
##3.4 Area & Year
# Sale price against area with points coloured by year built. The smoother
# is given a fixed black colour, so a single overall linear fit is drawn
# rather than one tied to the colour scale.
ggplot(
  data = ames,
  mapping = aes(x = area, y = price, colour = Year.Built)
) +
  geom_point(alpha = 0.35) +
  geom_smooth(colour = "black", se = FALSE, method = "lm") +
  scale_y_continuous(labels = dollar_format()) +
  ggtitle("Sale Price vs Area (Color = Year Built)") +
  xlab("Area (square feet)") +
  ylab("Sale price") +
  labs(colour = "Year built") +
  theme_hw
It seems that homes built more recently are more expensive than those built further in the past.
##3.5 Do newer homes tend to be larger?
# Area against year built with a loess smoother and its confidence band, to
# show how home size changes with construction year.
ggplot(data = ames, mapping = aes(Year.Built, area)) +
  geom_point(alpha = 0.25) +
  geom_smooth(se = TRUE, method = "loess") +
  ggtitle("Area vs Year Built") +
  xlab("Year built") +
  ylab("Area (square feet)") +
  theme_hw
Yes, on average, newer homes are larger than older ones.