#1 Data visualization and exploration tasks with gpa data set

# Load the gpa data set and print a per-column summary.
data("gpa")
summary(object = gpa)
##       gpa          studyweek       sleepnight         out           gender  
##  Min.   :2.900   Min.   : 2.00   Min.   :5.000   Min.   :0.000   female:43  
##  1st Qu.:3.400   1st Qu.:10.00   1st Qu.:6.000   1st Qu.:1.250   male  :12  
##  Median :3.650   Median :15.00   Median :7.000   Median :2.000              
##  Mean   :3.600   Mean   :19.15   Mean   :7.064   Mean   :2.109              
##  3rd Qu.:3.825   3rd Qu.:26.50   3rd Qu.:8.000   3rd Qu.:3.000              
##  Max.   :4.670   Max.   :50.00   Max.   :9.000   Max.   :4.000
# Open the help page that documents the gpa data set.
help("gpa")

##1.1 Precise meaning of each variable. - gpa: Student’s Grade Point Average - studyweek: Hours spent per week studying - sleepnight: Hours slept per night - out: Nights out per week - gender: The gender of the student

##1.2 studyweek vs gpa

# Scatter of gpa against studyweek with a linear-model smoother and its
# confidence band; theme_hw comes from outside this section.
ggplot(data = gpa, mapping = aes(studyweek, gpa)) +
  geom_point(alpha = 0.7) +
  geom_smooth(se = TRUE, method = "lm") +
  ggtitle("GPA vs Study Hours per Week") +
  xlab("Study hours per week") +
  ylab("GPA") +
  theme_hw

The number of hours studied per week seems to increase GPA only slightly. There are even outliers: some students with little study time have a high GPA, and some with a lot of study time have a low GPA.

##1.3 out vs gpa

# Jittered scatter of gpa against out (points are spread horizontally to
# reduce overplotting) with a linear-model smoother and confidence band.
ggplot(data = gpa, mapping = aes(out, gpa)) +
  geom_jitter(alpha = 0.7, width = 0.15) +
  geom_smooth(se = TRUE, method = "lm") +
  ggtitle("GPA vs Nights Out per Week") +
  xlab("Nights out per week") +
  ylab("GPA") +
  theme_hw

In general, the more nights out, the higher the gpa, but not by much.

##1.4 out vs sleepnight

# Jittered scatter of sleepnight against out with a linear-model smoother
# and confidence band.
ggplot(data = gpa, mapping = aes(out, sleepnight)) +
  geom_jitter(alpha = 0.7, width = 0.15) +
  geom_smooth(se = TRUE, method = "lm") +
  ggtitle("Sleep per Night vs Nights Out per Week") +
  xlab("Nights out per week") +
  ylab("Hours of sleep per night") +
  theme_hw

The more nights spent out, the more sleep gained per night.

##1.5 gender vs studyweek

# Box plot of studyweek for each gender. The errorbar layer is added first
# so the whisker caps are drawn underneath the boxes.
ggplot(data = gpa, mapping = aes(gender, studyweek)) +
  stat_boxplot(width = 0.5, geom = "errorbar") +
  geom_boxplot(outlier.alpha = 0.6, width = 0.18) +
  ggtitle("Study Hours per Week by Gender") +
  xlab("Gender") +
  ylab("Study hours per week") +
  theme_hw

The median study hours per week are about the same for both genders, but females have a higher upper limit for study hours per week.

##1.6 gender vs out

# Box plot of out (nights out per week) for each gender. The errorbar
# layer is added first so the whisker caps are drawn underneath the boxes.
ggplot(data = gpa, mapping = aes(gender, out)) +
  stat_boxplot(width = 0.5, geom = "errorbar") +
  geom_boxplot(outlier.alpha = 0.6, width = 0.18) +
  ggtitle("Nights Out per Week by Gender") +
  xlab("Gender") +
  ylab("Nights out per week") +
  theme_hw

The upper limit of nights out per week is the same for both genders, but males go out more.

##1.7 What is the GPA of each gender?

# Box plot of gpa for each gender. The errorbar layer is added first so
# the whisker caps are drawn underneath the boxes.
ggplot(data = gpa, mapping = aes(gender, gpa)) +
  stat_boxplot(width = 0.5, geom = "errorbar") +
  geom_boxplot(outlier.alpha = 0.6, width = 0.18) +
  ggtitle("GPA by Gender") +
  xlab("Gender") +
  ylab("GPA") +
  theme_hw

Drawing on the previous graphs: because females go out fewer nights and study more hours per week than males, females have a higher GPA in general.

#2. Data visualization tasks with loans_full_schema data set

# Load the loans_full_schema data set and print a compact column overview.
data("loans_full_schema")
glimpse(x = loans_full_schema)
## Rows: 10,000
## Columns: 55
## $ emp_title                        <chr> "global config engineer ", "warehouse…
## $ emp_length                       <dbl> 3, 10, 3, 1, 10, NA, 10, 10, 10, 3, 1…
## $ state                            <fct> NJ, HI, WI, PA, CA, KY, MI, AZ, NV, I…
## $ homeownership                    <fct> MORTGAGE, RENT, RENT, RENT, RENT, OWN…
## $ annual_income                    <dbl> 90000, 40000, 40000, 30000, 35000, 34…
## $ verified_income                  <fct> Verified, Not Verified, Source Verifi…
## $ debt_to_income                   <dbl> 18.01, 5.04, 21.15, 10.16, 57.96, 6.4…
## $ annual_income_joint              <dbl> NA, NA, NA, NA, 57000, NA, 155000, NA…
## $ verification_income_joint        <fct> , , , , Verified, , Not Verified, , ,…
## $ debt_to_income_joint             <dbl> NA, NA, NA, NA, 37.66, NA, 13.12, NA,…
## $ delinq_2y                        <int> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0…
## $ months_since_last_delinq         <int> 38, NA, 28, NA, NA, 3, NA, 19, 18, NA…
## $ earliest_credit_line             <dbl> 2001, 1996, 2006, 2007, 2008, 1990, 2…
## $ inquiries_last_12m               <int> 6, 1, 4, 0, 7, 6, 1, 1, 3, 0, 4, 4, 8…
## $ total_credit_lines               <int> 28, 30, 31, 4, 22, 32, 12, 30, 35, 9,…
## $ open_credit_lines                <int> 10, 14, 10, 4, 16, 12, 10, 15, 21, 6,…
## $ total_credit_limit               <int> 70795, 28800, 24193, 25400, 69839, 42…
## $ total_credit_utilized            <int> 38767, 4321, 16000, 4997, 52722, 3898…
## $ num_collections_last_12m         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ num_historical_failed_to_pay     <int> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ months_since_90d_late            <int> 38, NA, 28, NA, NA, 60, NA, 71, 18, N…
## $ current_accounts_delinq          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ total_collection_amount_ever     <int> 1250, 0, 432, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ current_installment_accounts     <int> 2, 0, 1, 1, 1, 0, 2, 2, 6, 1, 2, 1, 2…
## $ accounts_opened_24m              <int> 5, 11, 13, 1, 6, 2, 1, 4, 10, 5, 6, 7…
## $ months_since_last_credit_inquiry <int> 5, 8, 7, 15, 4, 5, 9, 7, 4, 17, 3, 4,…
## $ num_satisfactory_accounts        <int> 10, 14, 10, 4, 16, 12, 10, 15, 21, 6,…
## $ num_accounts_120d_past_due       <int> 0, 0, 0, 0, 0, 0, 0, NA, 0, 0, 0, 0, …
## $ num_accounts_30d_past_due        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ num_active_debit_accounts        <int> 2, 3, 3, 2, 10, 1, 3, 5, 11, 3, 2, 2,…
## $ total_debit_limit                <int> 11100, 16500, 4300, 19400, 32700, 272…
## $ num_total_cc_accounts            <int> 14, 24, 14, 3, 20, 27, 8, 16, 19, 7, …
## $ num_open_cc_accounts             <int> 8, 14, 8, 3, 15, 12, 7, 12, 14, 5, 8,…
## $ num_cc_carrying_balance          <int> 6, 4, 6, 2, 13, 5, 6, 10, 14, 3, 5, 3…
## $ num_mort_accounts                <int> 1, 0, 0, 0, 0, 3, 2, 7, 2, 0, 2, 3, 3…
## $ account_never_delinq_percent     <dbl> 92.9, 100.0, 93.5, 100.0, 100.0, 78.1…
## $ tax_liens                        <int> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ public_record_bankrupt           <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ loan_purpose                     <fct> moving, debt_consolidation, other, de…
## $ application_type                 <fct> individual, individual, individual, i…
## $ loan_amount                      <int> 28000, 5000, 2000, 21600, 23000, 5000…
## $ term                             <dbl> 60, 36, 36, 36, 36, 36, 60, 60, 36, 3…
## $ interest_rate                    <dbl> 14.07, 12.61, 17.09, 6.72, 14.07, 6.7…
## $ installment                      <dbl> 652.53, 167.54, 71.40, 664.19, 786.87…
## $ grade                            <fct> C, C, D, A, C, A, C, B, C, A, C, B, C…
## $ sub_grade                        <fct> C3, C1, D1, A3, C3, A3, C2, B5, C2, A…
## $ issue_month                      <fct> Mar-2018, Feb-2018, Feb-2018, Jan-201…
## $ loan_status                      <fct> Current, Current, Current, Current, C…
## $ initial_listing_status           <fct> whole, whole, fractional, whole, whol…
## $ disbursement_method              <fct> Cash, Cash, Cash, Cash, Cash, Cash, C…
## $ balance                          <dbl> 27015.86, 4651.37, 1824.63, 18853.26,…
## $ paid_total                       <dbl> 1999.330, 499.120, 281.800, 3312.890,…
## $ paid_principal                   <dbl> 984.14, 348.63, 175.37, 2746.74, 1569…
## $ paid_interest                    <dbl> 1015.19, 150.49, 106.43, 566.15, 754.…
## $ paid_late_fees                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…

##2.1 Histogram Density Curve

# Histogram of interest_rate scaled to density (so it shares a y axis with
# the kernel density curve drawn on top of it).
ggplot(data = loans_full_schema, mapping = aes(interest_rate)) +
  geom_histogram(
    mapping = aes(y = after_stat(density)),
    alpha = 0.7,
    bins = 35,
    boundary = 0
  ) +
  geom_density(linewidth = 1) +
  ggtitle("Distribution of Interest Rates") +
  xlab("Interest rate (%)") +
  ylab("Density") +
  theme_hw

The distribution is skewed to the right, meaning most interest rates are less than 15%.

##2.2 Categorical effect on a numeric distribution

# Box plot of interest_rate for each homeownership level, after dropping
# rows where either column is missing.
loans_full_schema %>%
  filter(!(is.na(homeownership) | is.na(interest_rate))) %>%
  ggplot(mapping = aes(homeownership, interest_rate)) +
  stat_boxplot(width = 0.5, geom = "errorbar") +
  geom_boxplot(outlier.alpha = 0.5, width = 0.18) +
  ggtitle("Interest Rate by Homeownership") +
  xlab("Homeownership status") +
  ylab("Interest rate (%)") +
  theme_hw

The interest rates for mortgage, own, and rent are about the same, with rent slightly higher than mortgage and own. Own has a higher median than mortgage.

##2.3 Bin heatmap for two numeric variables

# 2D binned heatmap of loan_amount against annual_income, with both axes
# labelled as dollar amounts; rows missing either column are dropped first.
loans_full_schema %>%
  filter(!(is.na(annual_income) | is.na(loan_amount))) %>%
  ggplot(mapping = aes(annual_income, loan_amount)) +
  geom_bin2d(bins = 35) +
  scale_x_continuous(labels = dollar_format()) +
  scale_y_continuous(labels = dollar_format()) +
  ggtitle("2D Binned Heatmap: Annual Income vs Loan Amount") +
  xlab("Annual income") +
  ylab("Loan amount") +
  theme_hw

In general, the lower the annual income, the higher the loan amount.

##2.4 facet_wrap

# Scatter of interest_rate against loan_amount with a linear fit (no
# confidence band), drawn as one panel per loan_purpose in a 3-column grid.
loans_full_schema %>%
  filter(
    !(is.na(loan_purpose) | is.na(loan_amount) | is.na(interest_rate))
  ) %>%
  ggplot(mapping = aes(loan_amount, interest_rate)) +
  geom_point(alpha = 0.25) +
  geom_smooth(se = FALSE, method = "lm") +
  scale_x_continuous(labels = dollar_format()) +
  ggtitle("Interest Rate vs Loan Amount, Faceted by Loan Purpose") +
  xlab("Loan amount") +
  ylab("Interest rate (%)") +
  facet_wrap(vars(loan_purpose), ncol = 3) +
  theme_hw

Lower interest rates tend to go with lower loan amounts; most loans are for credit cards or debt consolidation.

##2.5 facet_grid informative plot

# Histograms of interest_rate in a grid: one row per verified_income level
# and one column per homeownership level. Rows missing any of the three
# columns are dropped first.
loans_full_schema %>%
  filter(
    !(is.na(homeownership) | is.na(verified_income) | is.na(interest_rate))
  ) %>%
  ggplot(mapping = aes(interest_rate)) +
  geom_histogram(alpha = 0.7, bins = 30, boundary = 0) +
  ggtitle(
    "Interest Rate Distributions by Homeownership and Income Verification"
  ) +
  xlab("Interest rate (%)") +
  ylab("Count") +
  facet_grid(rows = vars(verified_income), cols = vars(homeownership)) +
  theme_hw

Owning has the lowest interest rates, while mortgage has the highest counts, though concentrated at the lower end of the interest rate range.

##2.6 Do borrowers with higher debt-to-income tend to have higher interest rates?

# Scatter of interest_rate against debt_to_income with a loess smoother and
# its confidence band; rows missing either column are dropped first.
# NOTE(review): loess is fit on every retained row and may be slow on a
# data set of this size -- confirm the render time is acceptable.
loans_full_schema %>%
  filter(!(is.na(debt_to_income) | is.na(interest_rate))) %>%
  ggplot(mapping = aes(debt_to_income, interest_rate)) +
  geom_point(alpha = 0.25) +
  geom_smooth(se = TRUE, method = "loess") +
  ggtitle("Interest Rate vs Debt-to-Income Ratio") +
  xlab("Debt-to-income ratio") +
  ylab("Interest rate (%)") +
  theme_hw

As debt-to-income (DTI) increases from low levels, interest rates rise. But at high DTI, the relationship between DTI and interest rate becomes less clear because of the small number of data points.

#3 Data visualization and exploration tasks with ames data set

# Load the ames data set and print a compact column overview.
data("ames")
glimpse(x = ames)
## Rows: 2,930
## Columns: 82
## $ Order           <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ PID             <int> 526301100, 526350040, 526351010, 526353030, 527105010,…
## $ area            <int> 1656, 896, 1329, 2110, 1629, 1604, 1338, 1280, 1616, 1…
## $ price           <int> 215000, 105000, 172000, 244000, 189900, 195500, 213500…
## $ MS.SubClass     <int> 20, 20, 20, 20, 60, 60, 120, 120, 120, 60, 60, 20, 60,…
## $ MS.Zoning       <fct> RL, RH, RL, RL, RL, RL, RL, RL, RL, RL, RL, RL, RL, RL…
## $ Lot.Frontage    <int> 141, 80, 81, 93, 74, 78, 41, 43, 39, 60, 75, NA, 63, 8…
## $ Lot.Area        <int> 31770, 11622, 14267, 11160, 13830, 9978, 4920, 5005, 5…
## $ Street          <fct> Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, Pave, …
## $ Alley           <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ Lot.Shape       <fct> IR1, Reg, IR1, Reg, IR1, IR1, Reg, IR1, IR1, Reg, IR1,…
## $ Land.Contour    <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, HLS, Lvl, Lvl, Lvl,…
## $ Utilities       <fct> AllPub, AllPub, AllPub, AllPub, AllPub, AllPub, AllPub…
## $ Lot.Config      <fct> Corner, Inside, Corner, Corner, Inside, Inside, Inside…
## $ Land.Slope      <fct> Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl, Gtl,…
## $ Neighborhood    <fct> NAmes, NAmes, NAmes, NAmes, Gilbert, Gilbert, StoneBr,…
## $ Condition.1     <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, Norm, Norm,…
## $ Condition.2     <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, …
## $ Bldg.Type       <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, TwnhsE, TwnhsE, Tw…
## $ House.Style     <fct> 1Story, 1Story, 1Story, 1Story, 2Story, 2Story, 1Story…
## $ Overall.Qual    <int> 6, 5, 6, 7, 5, 6, 8, 8, 8, 7, 6, 6, 6, 7, 8, 8, 8, 9, …
## $ Overall.Cond    <int> 5, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 7, 2, …
## $ Year.Built      <int> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ Year.Remod.Add  <int> 1960, 1961, 1958, 1968, 1998, 1998, 2001, 1992, 1996, …
## $ Roof.Style      <fct> Hip, Gable, Hip, Hip, Gable, Gable, Gable, Gable, Gabl…
## $ Roof.Matl       <fct> CompShg, CompShg, CompShg, CompShg, CompShg, CompShg, …
## $ Exterior.1st    <fct> BrkFace, VinylSd, Wd Sdng, BrkFace, VinylSd, VinylSd, …
## $ Exterior.2nd    <fct> Plywood, VinylSd, Wd Sdng, BrkFace, VinylSd, VinylSd, …
## $ Mas.Vnr.Type    <fct> Stone, None, BrkFace, None, None, BrkFace, None, None,…
## $ Mas.Vnr.Area    <int> 112, 0, 108, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603,…
## $ Exter.Qual      <fct> TA, TA, TA, Gd, TA, TA, Gd, Gd, Gd, TA, TA, TA, TA, TA…
## $ Exter.Cond      <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, Gd, TA, TA…
## $ Foundation      <fct> CBlock, CBlock, CBlock, CBlock, PConc, PConc, PConc, P…
## $ Bsmt.Qual       <fct> TA, TA, TA, TA, Gd, TA, Gd, Gd, Gd, TA, Gd, Gd, Gd, Gd…
## $ Bsmt.Cond       <fct> Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
## $ Bsmt.Exposure   <fct> Gd, No, No, No, No, No, Mn, No, No, No, No, No, No, Gd…
## $ BsmtFin.Type.1  <fct> BLQ, Rec, ALQ, ALQ, GLQ, GLQ, GLQ, ALQ, GLQ, Unf, Unf,…
## $ BsmtFin.SF.1    <int> 639, 468, 923, 1065, 791, 602, 616, 263, 1180, 0, 0, 9…
## $ BsmtFin.Type.2  <fct> Unf, LwQ, Unf, Unf, Unf, Unf, Unf, Unf, Unf, Unf, Unf,…
## $ BsmtFin.SF.2    <int> 0, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1120, 0, 0…
## $ Bsmt.Unf.SF     <int> 441, 270, 406, 1045, 137, 324, 722, 1017, 415, 994, 76…
## $ Total.Bsmt.SF   <int> 1080, 882, 1329, 2110, 928, 926, 1338, 1280, 1595, 994…
## $ Heating         <fct> GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, GasA, …
## $ Heating.QC      <fct> Fa, TA, TA, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Gd, Ex, Gd, Gd…
## $ Central.Air     <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, …
## $ Electrical      <fct> SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr, SBrkr…
## $ X1st.Flr.SF     <int> 1656, 896, 1329, 2110, 928, 926, 1338, 1280, 1616, 102…
## $ X2nd.Flr.SF     <int> 0, 0, 0, 0, 701, 678, 0, 0, 0, 776, 892, 0, 676, 0, 0,…
## $ Low.Qual.Fin.SF <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Bsmt.Full.Bath  <int> 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, …
## $ Bsmt.Half.Bath  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Full.Bath       <int> 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 3, 2, 1, …
## $ Half.Bath       <int> 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, …
## $ Bedroom.AbvGr   <int> 3, 2, 3, 3, 3, 3, 2, 2, 2, 3, 3, 3, 3, 2, 1, 4, 4, 1, …
## $ Kitchen.AbvGr   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Kitchen.Qual    <fct> TA, TA, Gd, Ex, TA, Gd, Gd, Gd, Gd, Gd, TA, TA, TA, Gd…
## $ TotRms.AbvGrd   <int> 7, 5, 6, 8, 6, 7, 6, 5, 5, 7, 7, 6, 7, 5, 4, 12, 8, 8,…
## $ Functional      <fct> Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ, Typ,…
## $ Fireplaces      <int> 2, 0, 0, 2, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, …
## $ Fireplace.Qu    <fct> Gd, NA, NA, TA, TA, Gd, NA, NA, TA, TA, TA, NA, Gd, Po…
## $ Garage.Type     <fct> Attchd, Attchd, Attchd, Attchd, Attchd, Attchd, Attchd…
## $ Garage.Yr.Blt   <int> 1960, 1961, 1958, 1968, 1997, 1998, 2001, 1992, 1995, …
## $ Garage.Finish   <fct> Fin, Unf, Unf, Fin, Fin, Fin, Fin, RFn, RFn, Fin, Fin,…
## $ Garage.Cars     <int> 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, …
## $ Garage.Area     <int> 528, 730, 312, 522, 482, 470, 582, 506, 608, 442, 440,…
## $ Garage.Qual     <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
## $ Garage.Cond     <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
## $ Paved.Drive     <fct> P, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, …
## $ Wood.Deck.SF    <int> 210, 140, 393, 0, 212, 360, 0, 0, 237, 140, 157, 483, …
## $ Open.Porch.SF   <int> 62, 0, 36, 0, 34, 36, 0, 82, 152, 60, 84, 21, 75, 0, 5…
## $ Enclosed.Porch  <int> 0, 0, 0, 0, 0, 0, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ X3Ssn.Porch     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Screen.Porch    <int> 0, 120, 0, 0, 0, 0, 0, 144, 0, 0, 0, 0, 0, 0, 140, 210…
## $ Pool.Area       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Pool.QC         <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ Fence           <fct> NA, MnPrv, NA, NA, MnPrv, NA, NA, NA, NA, NA, NA, GdPr…
## $ Misc.Feature    <fct> NA, NA, Gar2, NA, NA, NA, NA, NA, NA, NA, NA, Shed, NA…
## $ Misc.Val        <int> 0, 0, 12500, 0, 0, 0, 0, 0, 0, 0, 0, 500, 0, 0, 0, 0, …
## $ Mo.Sold         <int> 5, 6, 6, 4, 3, 6, 4, 1, 3, 6, 4, 3, 5, 2, 6, 6, 6, 6, …
## $ Yr.Sold         <int> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, …
## $ Sale.Type       <fct> WD , WD , WD , WD , WD , WD , WD , WD , WD , WD , WD ,…
## $ Sale.Condition  <fct> Normal, Normal, Normal, Normal, Normal, Normal, Normal…

##3.1 Area vs price

# Scatter of price against area with a linear-model smoother and its
# confidence band; the y axis is labelled in dollars.
ggplot(data = ames, mapping = aes(area, price)) +
  geom_point(alpha = 0.25) +
  geom_smooth(se = TRUE, method = "lm") +
  scale_y_continuous(labels = dollar_format()) +
  ggtitle("Sale Price vs Area") +
  xlab("Area (square feet)") +
  ylab("Sale price") +
  theme_hw

Generally, the larger the area, the higher the sale price, in an almost linear relationship.

##3.2 Bldg.Type vs price

# Box plot of price for each Bldg.Type, with the y axis labelled in
# dollars; rows missing either column are dropped first.
ames %>%
  filter(!(is.na(Bldg.Type) | is.na(price))) %>%
  ggplot(mapping = aes(Bldg.Type, price)) +
  stat_boxplot(width = 0.5, geom = "errorbar") +
  geom_boxplot(outlier.alpha = 0.4, width = 0.18) +
  scale_y_continuous(labels = dollar_format()) +
  ggtitle("Sale Price by Building Type") +
  xlab("Building type (Bldg.Type)") +
  ylab("Sale price") +
  theme_hw

- 1Fam: Single-family detached - 2FmCon: Two-family conversion (originally single-family, converted to two-family) - Duplx: Duplex - TwnhsE: Townhouse end unit - Twnhs: Townhouse inside unit

All the home types tend to have around the same median price, but single-family homes have the widest range and the most outliers, showing that their price varies greatly compared to the other home types.

##3.3 Bldg.Type and area together vs price

# Scatter of price against area, coloured by Bldg.Type. Because colour is
# mapped in the plot-level aesthetics, the linear smoother (no confidence
# band) is fit separately for each building type.
ggplot(
  data = ames,
  mapping = aes(area, price, color = Bldg.Type)
) +
  geom_point(alpha = 0.35) +
  geom_smooth(se = FALSE, method = "lm") +
  scale_y_continuous(labels = dollar_format()) +
  ggtitle("Sale Price vs Area, Colored by Building Type") +
  xlab("Area (square feet)") +
  ylab("Sale price") +
  labs(color = "Bldg.Type") +
  theme_hw

All the home types increase in price as their area increases, with some increasing more in price than the other types of homes.

##3.4 Area & Year

# Scatter of price against area with points coloured by Year.Built. The
# linear smoother overrides the mapped colour with a fixed black line and
# draws no confidence band.
ggplot(
  data = ames,
  mapping = aes(area, price, color = Year.Built)
) +
  geom_point(alpha = 0.35) +
  geom_smooth(color = "black", se = FALSE, method = "lm") +
  scale_y_continuous(labels = dollar_format()) +
  ggtitle("Sale Price vs Area (Color = Year Built)") +
  xlab("Area (square feet)") +
  ylab("Sale price") +
  labs(color = "Year built") +
  theme_hw

It seems that homes built more recently are more expensive than those built in the past.

##3.5 Do newer homes tend to be larger?

# Scatter of area against Year.Built with a loess smoother and its
# confidence band.
ggplot(data = ames, mapping = aes(Year.Built, area)) +
  geom_point(alpha = 0.25) +
  geom_smooth(se = TRUE, method = "loess") +
  ggtitle("Area vs Year Built") +
  xlab("Year built") +
  ylab("Area (square feet)") +
  theme_hw

Yes, on average, the newer the home, the larger it will be compared to the past.