library(lessR)
## 
## lessR 4.4.1                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
## 
## Attaching package: 'lessR'
## The following object is masked from 'package:base':
## 
##     sort_by
# Load the HomesForSale listings from the Lock5 site into the data frame `home`.
# Assignment uses `<-`, matching the rest of this script.
home <- Read("https://www.lock5stat.com/datasets3e/HomesForSale.csv")
## 
## >>> Suggestions
## Recommended binary format for data files: feather
##   Create with Write(d, "your_file", format="feather")
## To read a csv or Excel file of variable labelsvar_labels=TRUE
##   Each row of the file:  Variable Name, Variable Label
## Read into a data frame named l  (the letter el)
## 
## More details about your data, Enter:  details()  for d, or  details(name)
## 
## Data Types
## ------------------------------------------------------------
## character: Non-numeric data values
## integer: Numeric data values, integers only
## double: Numeric data values with decimal digits
## ------------------------------------------------------------
## 
##     Variable                  Missing  Unique 
##         Name     Type  Values  Values  Values   First and last values
## ------------------------------------------------------------------------------------------
##  1     State character    120       0       4   CA  CA  CA ... PA  PA  PA
##  2     Price   integer    120       0      98   533  610  899 ... 90  228  109
##  3      Size   integer    120       0     111   1589  2008  2380 ... 1768  1732  1770
##  4      Beds   integer    120       0       6   3  3  5 ... 3  3  3
##  5     Baths    double    120       0       8   2.5  2  3 ... 2  4  2
## ------------------------------------------------------------------------------------------
# Preview the first six rows to confirm the columns loaded as expected.
head(home, n = 6L)
##   State Price Size Beds Baths
## 1    CA   533 1589    3   2.5
## 2    CA   610 2008    3   2.0
## 3    CA   899 2380    5   3.0
## 4    CA   929 1868    3   3.0
## 5    CA   210 1360    2   2.0
## 6    CA   268 2131    3   2.0

1. Introduction

This report explores how three features of a home (size, number of bedrooms, and number of bathrooms) relate to its price, using simple and multiple linear regression. It also uses a one-way ANOVA to test whether mean home prices differ significantly by state.

All data was gathered from https://www.lock5stat.com/datasets3e/HomesForSale.csv

I will be performing an analysis of the data relating to the following questions:

  1. How much does the size of a home influence its price (California only)?
  2. How does the number of bedrooms influence the price (California only)?
  3. How does the number of bathrooms influence the price (California only)?
  4. How do size, bedrooms, and bathrooms jointly influence home price (California only)?
  5. Are there significant price differences between homes across CA, NY, NJ, and PA?

2. Analysis

Q1: How much does the size of a home influence its price (California only)?

# Keep only the California listings; which() discards any NA comparisons,
# so the selected rows are the same ones subset() would return.
home_CA <- home[which(home$State == "CA"), ]
# Q1 model: regress Price on Size alone
Regression(my_formula = Price ~ Size, data = home_CA)

## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## Regression(my_formula=Price ~ Size, data=home_CA, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  home_CA 
##  
## Response Variable: Price 
## Predictor Variable: Size 
##  
## Number of cases (rows) of data:  30 
## Number of cases retained for analysis:  30 
## 
## 
##   BASIC ANALYSIS 
## 
##              Estimate    Std Err  t-value  p-value   Lower 95%   Upper 95% 
## (Intercept)   -56.817    154.681   -0.367    0.716    -373.666     260.033 
##        Size     0.339      0.086    3.963    0.000       0.164       0.514 
## 
## Standard deviation of Price: 269.1774 
##  
## Standard deviation of residuals:  219.2566 for df=28 
## 95% range of residuals:  898.2537 = 2 * (2.048 * 219.2566) 
##  
## R-squared: 0.359    Adjusted R-squared: 0.337    PRESS R-squared: 0.273 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 15.709     df: 1 and 28     p-value:  0.000 
## 
## -- Analysis of Variance 
##  
##             df       Sum Sq     Mean Sq   F-value   p-value 
## Model        1   755179.865  755179.865    15.709     0.000 
## Residuals   28  1346057.102   48073.468 
## Price       29  2101236.967   72456.447 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##         Price Size 
##   Price  1.00 0.60 
##    Size  0.60 1.00 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 20, out of 30 rows of data, or do n_res_rows="all"] 
## ----------------------------------------------------------- 
##          Size    Price  fitted    resid rstdnt dffits cooks 
##   21     2224      235 697.546 -462.546 -2.356 -0.637 0.175 
##    7     2436     1095 769.454  325.546  1.615  0.556 0.146 
##    6     2131      268 666.001 -398.001 -1.961 -0.477 0.103 
##   12     2286      469 718.575 -249.575 -1.194 -0.347 0.059 
##   10     1371      700 408.215  291.785  1.391  0.335 0.054 
##    8     1375      699 409.572  289.428  1.379  0.330 0.053 
##    4     1868      929 576.793  352.207  1.689  0.325 0.049 
##   11     1440      145 431.619 -286.619 -1.360 -0.304 0.045 
##   24     1688      195 515.739 -320.739 -1.523 -0.285 0.039 
##   18     2479      929 784.039  144.961  0.696  0.251 0.032 
##    3     2380      899 750.459  148.541  0.706  0.228 0.026 
##   23      972      148 272.878 -124.878 -0.602 -0.227 0.026 
##    5     1360      210 404.484 -194.484 -0.910 -0.222 0.025 
##   17     1440      640 431.619  208.381  0.973  0.217 0.024 
##   20     1309      559 387.185  171.815  0.804  0.207 0.022 
##   25     1431      619 428.567  190.433  0.887  0.200 0.020 
##   29     1828      368 563.226 -195.226 -0.903 -0.170 0.015 
##   16     1419      285 424.496 -139.496 -0.646 -0.148 0.011 
##   28     2340      835 736.892   98.108  0.462  0.143 0.010 
##   15     1464      300 439.760 -139.760 -0.646 -0.141 0.010 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##    [to see all intervals add n_pred_rows="all"] 
##  ---------------------------------------------- 
## 
##          Size    Price    pred  s_pred   pi.lwr   pi.upr   width 
##   22      834      180 226.069 236.148 -257.658  709.796 967.454 
##   23      972      148 272.878 232.512 -203.402  749.158 952.560 
## ... 
##    1     1589      533 482.159 223.285   24.780  939.537 914.757 
##   24     1688      195 515.739 222.936   59.075  972.402 913.327 
##   29     1828      368 563.226 222.992  106.448 1020.004 913.556 
## ... 
##    7     2436     1095 769.454 230.574  297.145 1241.763 944.617 
##   14     2474      841 782.343 231.428  308.285 1256.402 948.118 
##   18     2479      929 784.039 231.544  309.744 1258.335 948.591 
## 
## ---------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## ----------------------------------
# Scatter plot of Price (y-axis) vs Size (x-axis) with a regression line.
# Plot() takes the x variable first, so the predictor Size leads and the
# response Price follows, matching the model Price ~ Size.
# fit="lm" requests the linear fit line that this plot is meant to show.
Plot(Size, Price, data=home_CA, color="blue", fit="lm")

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(Price, Size, enhance=TRUE)  # many options
## Plot(Price, Size, fill="skyblue")  # interior fill color of points
## Plot(Price, Size, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(Price, Size, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 30 
## Sample Correlation of Price and Size: r = 0.599 
##   
## Hypothesis Test of 0 Correlation:  t = 3.963,  df = 28,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  0.305 to 0.789 
## 

Q2: How does the number of bedrooms influence the price (California only)?

# Q2 model: regress Price on the number of bedrooms only (California listings)
Regression(my_formula = Price ~ Beds, data = home_CA)

## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## Regression(my_formula=Price ~ Beds, data=home_CA, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  home_CA 
##  
## Response Variable: Price 
## Predictor Variable: Beds 
##  
## Number of cases (rows) of data:  30 
## Number of cases retained for analysis:  30 
## 
## 
##   BASIC ANALYSIS 
## 
##              Estimate    Std Err  t-value  p-value   Lower 95%   Upper 95% 
## (Intercept)   269.762    233.618    1.155    0.258    -208.782     748.306 
##        Beds    84.767     72.911    1.163    0.255     -64.583     234.118 
## 
## Standard deviation of Price: 269.1774 
##  
## Standard deviation of residuals:  267.5599 for df=28 
## 95% range of residuals:  1,096.143 = 2 * (2.048 * 267.5599) 
##  
## R-squared: 0.046    Adjusted R-squared: 0.012    PRESS R-squared: -0.128 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 1.352     df: 1 and 28     p-value:  0.255 
## 
## -- Analysis of Variance 
##  
##             df       Sum Sq    Mean Sq   F-value   p-value 
## Model        1    96764.729  96764.729     1.352     0.255 
## Residuals   28  2004472.238  71588.294 
## Price       29  2101236.967  72456.447 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##         Price Beds 
##   Price  1.00 0.21 
##    Beds  0.21 1.00 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 20, out of 30 rows of data, or do n_res_rows="all"] 
## -------------------------------------------------------- 
##       Beds    Price  fitted    resid rstdnt dffits cooks 
##   29     5      368 693.599 -325.599 -1.477 -0.948 0.432 
##    3     5      899 693.599  205.401  0.910  0.584 0.172 
##   24     4      195 608.832 -413.832 -1.672 -0.523 0.128 
##    7     3     1095 524.064  570.936  2.339  0.443 0.085 
##    8     2      699 439.297  259.703  1.041  0.400 0.080 
##   22     2      180 439.297 -259.297 -1.040 -0.400 0.080 
##   18     4      929 608.832  320.168  1.267  0.396 0.077 
##    5     2      210 439.297 -229.297 -0.915 -0.352 0.062 
##    4     3      929 524.064  404.936  1.581  0.300 0.043 
##   11     3      145 524.064 -379.064 -1.472 -0.279 0.037 
##   23     3      148 524.064 -376.064 -1.459 -0.276 0.037 
##   14     3      841 524.064  316.936  1.216  0.230 0.026 
##   28     3      835 524.064  310.936  1.192  0.226 0.025 
##   21     3      235 524.064 -289.064 -1.104 -0.209 0.022 
##    6     3      268 524.064 -256.064 -0.973 -0.184 0.017 
##   16     3      285 524.064 -239.064 -0.906 -0.172 0.015 
##   15     3      300 524.064 -224.064 -0.848 -0.161 0.013 
##    9     3      729 524.064  204.936  0.774  0.147 0.011 
##   19     4      714 608.832  105.168  0.406  0.127 0.008 
##   10     3      700 524.064  175.936  0.663  0.126 0.008 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##    [to see all intervals add n_pred_rows="all"] 
##  ---------------------------------------------- 
## 
##         Beds   Price    pred  s_pred   pi.lwr   pi.upr    width 
##      5     2     210 439.297 284.258 -142.979 1021.573 1164.552 
##      8     2     699 439.297 284.258 -142.979 1021.573 1164.552 
## ... 
##     22     2     180 439.297 284.258 -142.979 1021.573 1164.552 
## ... 
##   22.1     2     180 439.297 284.258 -142.979 1021.573 1164.552 
##      1     3     533 524.064 272.156  -33.423 1081.551 1114.974 
##      2     3     610 524.064 272.156  -33.423 1081.551 1114.974 
## ... 
##     24     4     195 608.832 279.227   36.862 1180.801 1143.939 
##      3     5     899 693.599 304.134   70.608 1316.590 1245.982 
##     29     5     368 693.599 304.134   70.608 1316.590 1245.982 
## 
## ---------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## ----------------------------------
# Scatter plot of Price (y-axis) vs Beds (x-axis).
# Plot() takes the x variable first, so the predictor Beds leads and the
# response Price follows, matching the model Price ~ Beds.
Plot(Beds, Price, data=home_CA, color="green")

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(Price, Beds, enhance=TRUE)  # many options
## Plot(Price, Beds, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(Price, Beds, out_cut=.10)  # label top 10% from center as outliers 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 30 
## Sample Correlation of Price and Beds: r = 0.215 
##   
## Hypothesis Test of 0 Correlation:  t = 1.163,  df = 28,  p-value = 0.255 
## 95% Confidence Interval for Correlation:  -0.158 to 0.534 
## 

Q3: How does the number of bathrooms influence the price (California only)?

# Q3 model: regress Price on the number of bathrooms only (California listings)
Regression(my_formula = Price ~ Baths, data = home_CA)

## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## Regression(my_formula=Price ~ Baths, data=home_CA, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  home_CA 
##  
## Response Variable: Price 
## Predictor Variable: Baths 
##  
## Number of cases (rows) of data:  30 
## Number of cases retained for analysis:  30 
## 
## 
##   BASIC ANALYSIS 
## 
##              Estimate    Std Err  t-value  p-value   Lower 95%   Upper 95% 
## (Intercept)    90.712    148.571    0.611    0.546    -213.622     395.047 
##       Baths   194.739     62.275    3.127    0.004      67.174     322.304 
## 
## Standard deviation of Price: 269.1774 
##  
## Standard deviation of residuals:  235.8384 for df=28 
## 95% range of residuals:  966.1862 = 2 * (2.048 * 235.8384) 
##  
## R-squared: 0.259    Adjusted R-squared: 0.232    PRESS R-squared: 0.149 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 9.779     df: 1 and 28     p-value:  0.004 
## 
## -- Analysis of Variance 
##  
##             df       Sum Sq     Mean Sq   F-value   p-value 
## Model        1   543883.601  543883.601     9.779     0.004 
## Residuals   28  1557353.365   55619.763 
## Price       29  2101236.967   72456.447 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##         Price Baths 
##   Price  1.00  0.51 
##   Baths  0.51  1.00 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 20, out of 30 rows of data, or do n_res_rows="all"] 
## -------------------------------------------------------- 
##      Baths    Price  fitted    resid rstdnt dffits cooks 
##    8 1.000      699 285.451  413.549  1.999  0.834 0.314 
##    7 2.000     1095 480.191  614.809  3.020  0.608 0.143 
##   15 3.000      300 674.930 -374.930 -1.703 -0.464 0.101 
##    9 4.000      729 869.669 -140.669 -0.677 -0.379 0.073 
##    4 3.000      929 674.930  254.070  1.122  0.306 0.046 
##   18 3.000      929 674.930  254.070  1.122  0.306 0.046 
##   11 2.000      145 480.191 -335.191 -1.480 -0.298 0.043 
##    3 3.000      899 674.930  224.070  0.984  0.268 0.036 
##   23 1.000      148 285.451 -137.451 -0.625 -0.260 0.035 
##   24 2.000      195 480.191 -285.191 -1.246 -0.251 0.031 
##   12 3.000      469 674.930 -205.930 -0.902 -0.246 0.030 
##    5 2.000      210 480.191 -270.191 -1.177 -0.237 0.028 
##   21 2.000      235 480.191 -245.191 -1.063 -0.214 0.023 
##   22 1.000      180 285.451 -105.451 -0.478 -0.199 0.020 
##   28 3.000      835 674.930  160.070  0.697  0.190 0.018 
##   10 2.000      700 480.191  219.809  0.949  0.191 0.018 
##    6 2.000      268 480.191 -212.191 -0.915 -0.184 0.017 
##   16 2.000      285 480.191 -195.191 -0.840 -0.169 0.014 
##   17 2.000      640 480.191  159.809  0.685  0.138 0.010 
##   14 3.500      841 772.299   68.701  0.308  0.123 0.008 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##    [to see all intervals add n_pred_rows="all"] 
##  ---------------------------------------------- 
## 
##      Baths   Price    pred  s_pred   pi.lwr   pi.upr    width 
##    8 1.000     699 285.451 252.707 -232.196  803.099 1035.294 
##   22 1.000     180 285.451 252.707 -232.196  803.099 1035.294 
##   23 1.000     148 285.451 252.707 -232.196  803.099 1035.294 
## ... 
##   30 2.000     360 480.191 240.385  -12.216  972.598  984.814 
##    1 2.500     533 577.560 240.116   85.704 1069.416  983.712 
##   25 2.500     619 577.560 240.116   85.704 1069.416  983.712 
## ... 
##   14 3.500     841 772.299 251.425  257.278 1287.320 1030.042 
##    9 4.000     729 869.669 262.493  331.976 1407.361 1075.385 
## 
## ---------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## ----------------------------------
# Scatter plot of Price (y-axis) vs Baths (x-axis).
# Plot() takes the x variable first, so the predictor Baths leads and the
# response Price follows, matching the model Price ~ Baths.
Plot(Baths, Price, data=home_CA, color="purple")

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(Price, Baths, enhance=TRUE)  # many options
## Plot(Price, Baths, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(Price, Baths, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 30 
## Sample Correlation of Price and Baths: r = 0.509 
##   
## Hypothesis Test of 0 Correlation:  t = 3.127,  df = 28,  p-value = 0.004 
## 95% Confidence Interval for Correlation:  0.182 to 0.734 
## 

Q4: How do size, bedrooms, and bathrooms jointly influence home price (California only)?

# Q4 model: regress Price on Size, Beds and Baths together (California listings)
Regression(
  my_formula = Price ~ Size + Beds + Baths,
  data = home_CA
)

## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## Regression(my_formula=Price ~ Size + Beds + Baths, data=home_CA, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  home_CA 
##  
## Response Variable: Price 
## Predictor Variable 1: Size 
## Predictor Variable 2: Beds 
## Predictor Variable 3: Baths 
##  
## Number of cases (rows) of data:  30 
## Number of cases retained for analysis:  30 
## 
## 
##   BASIC ANALYSIS 
## 
##              Estimate    Std Err  t-value  p-value   Lower 95%   Upper 95% 
## (Intercept)   -41.561    210.381   -0.198    0.845    -474.005     390.883 
##        Size     0.281      0.119    2.364    0.026       0.037       0.526 
##        Beds   -33.704     67.926   -0.496    0.624    -173.326     105.919 
##       Baths    83.984     76.753    1.094    0.284     -73.784     241.752 
## 
## Standard deviation of Price: 269.1774 
##  
## Standard deviation of residuals:  221.8196 for df=26 
## 95% range of residuals:  911.9134 = 2 * (2.056 * 221.8196) 
##  
## R-squared: 0.391    Adjusted R-squared: 0.321    PRESS R-squared: 0.167 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 5.568     df: 3 and 26     p-value:  0.004 
## 
## -- Analysis of Variance 
##  
##             df       Sum Sq     Mean Sq   F-value   p-value 
##      Size    1   755179.865  755179.865    15.348     0.001 
##      Beds    1     7842.365    7842.365     0.159     0.693 
##     Baths    1    58912.405   58912.405     1.197     0.284 
##  
## Model        3   821934.634  273978.211     5.568     0.004 
## Residuals   26  1279302.332   49203.936 
## Price       29  2101236.967   72456.447 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##         Price Size Beds Baths 
##   Price  1.00 0.60 0.21  0.51 
##    Size  0.60 1.00 0.45  0.64 
##    Beds  0.21 0.45 1.00  0.35 
##   Baths  0.51 0.64 0.35  1.00 
## 
##         Tolerance       VIF 
##    Size     0.530     1.887 
##    Beds     0.792     1.263 
##   Baths     0.582     1.717 
## 
##  Size Beds Baths    R2adj    X's 
##     1    0     1    0.340      2 
##     1    0     0    0.337      1 
##     1    1     1    0.321      3 
##     1    1     0    0.316      2 
##     0    0     1    0.232      1 
##     0    1     1    0.206      2 
##     0    1     0    0.012      1 
##  
## [based on Thomas Lumley's leaps function from the leaps package] 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 20, out of 30 rows of data, or do n_res_rows="all"] 
## ----------------------------------------------------------------------- 
##          Size  Beds Baths    Price  fitted    resid rstdnt dffits cooks 
##    7     2436     3 2.000     1095 710.063  384.937  2.108  1.159 0.297 
##    8     1375     2 1.000      699 361.532  337.468  1.791  0.944 0.205 
##   21     2224     3 2.000      235 650.469 -415.469 -2.165 -0.893 0.174 
##    3     2380     5 3.000      899 710.899  188.101  1.009  0.649 0.105 
##    6     2131     3 2.000      268 624.327 -356.327 -1.777 -0.642 0.095 
##   24     1688     4 2.000      195 466.095 -271.095 -1.322 -0.490 0.058 
##   15     1464     3 3.000      300 520.816 -220.816 -1.091 -0.477 0.057 
##   12     2286     3 3.000      469 751.882 -282.882 -1.372 -0.473 0.054 
##   29     1828     5 2.000      368 471.746 -103.746 -0.582 -0.447 0.051 
##    5     1360     2 2.000      210 441.300 -231.300 -1.125 -0.438 0.047 
##    4     1868     3 3.000      929 634.381  294.619  1.415  0.433 0.045 
##   10     1371     3 2.000      700 410.689  289.311  1.365  0.333 0.027 
##   11     1440     3 2.000      145 430.085 -285.085 -1.337 -0.300 0.022 
##   18     2479     4 3.000      929 772.431  156.569  0.750  0.290 0.021 
##   17     1440     3 2.000      640 430.085  209.915  0.969  0.217 0.012 
##   25     1431     3 2.500      619 469.547  149.453  0.695  0.205 0.011 
##   20     1309     3 2.000      559 393.261  165.739  0.767  0.204 0.011 
##   23      972     3 1.000      148 214.545  -66.545 -0.325 -0.151 0.006 
##   16     1419     3 2.000      285 424.182 -139.182 -0.636 -0.146 0.005 
##   28     2340     3 3.000      835 767.062   67.938  0.320  0.116 0.003 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##    [to see all intervals add n_pred_rows="all"] 
##  ---------------------------------------------- 
## 
##          Size  Beds Baths   Price    pred  s_pred  pi.lwr   pi.upr    width 
##   20     1309     3 2.000     559 393.261 229.039 -77.537  864.058  941.595 
##   29     1828     5 2.000     368 471.746 259.729 -62.135 1005.627 1067.762 
##   10     1371     3 2.000     700 410.689 227.980 -57.930  879.308  937.239 
## ... 
##   27     1477     3 2.000     408 440.486 226.713 -25.530  906.501  932.031 
##   26     1488     3 2.000     549 443.578 226.621 -22.249  909.405  931.654 
##   24     1688     4 2.000     195 466.095 234.819 -16.582  948.772  965.354 
## ... 
##   28     2340     3 3.000     835 767.062 234.364 285.319 1248.804  963.485 
##   18     2479     4 3.000     929 772.431 235.772 287.795 1257.068  969.273 
##   14     2474     3 3.500     841 846.722 241.681 349.940 1343.504  993.564 
## 
## ---------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## ----------------------------------
# How Size influences Price: predictor Size on the x-axis, response Price on
# the y-axis (Plot() takes the x variable first).
Plot(Size, Price, data=home_CA, color="blue")

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(Price, Size, enhance=TRUE)  # many options
## Plot(Price, Size, fill="skyblue")  # interior fill color of points
## Plot(Price, Size, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(Price, Size, out_cut=.10)  # label top 10% from center as outliers 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 30 
## Sample Correlation of Price and Size: r = 0.599 
##   
## Hypothesis Test of 0 Correlation:  t = 3.963,  df = 28,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  0.305 to 0.789 
## 
# How Beds influence Price: predictor Beds on the x-axis, response Price on
# the y-axis (Plot() takes the x variable first).
Plot(Beds, Price, data=home_CA, color="green")

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(Price, Beds, enhance=TRUE)  # many options
## Plot(Price, Beds, fill="skyblue")  # interior fill color of points
## Plot(Price, Beds, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(Price, Beds, out_cut=.10)  # label top 10% from center as outliers 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 30 
## Sample Correlation of Price and Beds: r = 0.215 
##   
## Hypothesis Test of 0 Correlation:  t = 1.163,  df = 28,  p-value = 0.255 
## 95% Confidence Interval for Correlation:  -0.158 to 0.534 
## 
# How Baths influence Price: predictor Baths on the x-axis, response Price on
# the y-axis (Plot() takes the x variable first).
Plot(Baths, Price, data=home_CA, color="purple")

## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(Price, Baths, enhance=TRUE)  # many options
## Plot(Price, Baths, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(Price, Baths, out_cut=.10)  # label top 10% from center as outliers 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 30 
## Sample Correlation of Price and Baths: r = 0.509 
##   
## Hypothesis Test of 0 Correlation:  t = 3.127,  df = 28,  p-value = 0.004 
## 95% Confidence Interval for Correlation:  0.182 to 0.734 
## 

Q5: Are there significant price differences between homes across CA, NY, NJ, and PA?

# Q5 model: one-way ANOVA of Price across the states, using every listing
ANOVA(Price ~ State, data = home)

## 
##   BACKGROUND 
## 
## Response Variable: Price 
##  
## Factor Variable: State 
##   Levels: CA NJ NY PA 
##  
## Number of cases (rows) of data:  120 
## Number of cases retained for analysis:  120 
## 
## 
##   DESCRIPTIVE STATISTICS  
## 
##      n     mean       sd      min       max 
## CA  30   535.37   269.18   145.00   1095.00 
## NJ  30   328.53   157.97   115.00    650.00 
## NY  30   365.33   317.82    35.00   1250.00 
## PA  30   265.57   137.09    50.00    550.00 
##  
## Grand Mean: 373.7 
## 
## 
##   ANOVA 
## 
##              df      Sum Sq    Mean Sq   F-value   p-value 
## State         3  1198168.73  399389.58      7.35    0.0001 
## Residuals   116  6299266.47   54304.02 
## 
## R Squared: 0.160 
## R Sq Adjusted: 0.138 
## Omega Squared: 0.137 
##  
## 
## Cohen's f: 0.399 
## 
## 
##   TUKEY MULTIPLE COMPARISONS OF MEANS 
## 
## Family-wise Confidence Level: 0.95 
## ------------------------------------- 
##            diff     lwr     upr p adj 
##   NJ-CA -206.83 -363.67  -49.99  0.00 
##   NY-CA -170.03 -326.87  -13.19  0.03 
##   PA-CA -269.80 -426.64 -112.96  0.00 
##   NY-NJ   36.80 -120.04  193.64  0.93 
##   PA-NJ  -62.97 -219.81   93.87  0.72 
##   PA-NY  -99.77 -256.61   57.07  0.35 
## 
## 
##   RESIDUALS 
## 
## Fitted Values, Residuals, Standardized Residuals 
##    [sorted by Standardized Residuals, ignoring + or - sign] 
##    [res_rows = 20, out of 120 cases (rows) of data, or res_rows="all"] 
## ------------------------------------------ 
##      State   Price fitted residual z-resid 
##   63    NY    1250 365.33   884.67    3.86 
##   86    NY    1000 365.33   634.67    2.77 
##   87    NY     929 365.33   563.67    2.46 
##    7    CA    1095 535.37   559.63    2.44 
##   88    NY     875 365.33   509.67    2.22 
##   64    NY     825 365.33   459.67    2.01 
##   75    NY     775 365.33   409.67    1.79 
##   18    CA     929 535.37   393.63    1.72 
##    4    CA     929 535.37   393.63    1.72 
##   11    CA     145 535.37  -390.37   -1.70 
##   23    CA     148 535.37  -387.37   -1.69 
##    3    CA     899 535.37   363.63    1.59 
##   22    CA     180 535.37  -355.37   -1.55 
##   24    CA     195 535.37  -340.37   -1.49 
##   65    NY      35 365.33  -330.33   -1.44 
##    5    CA     210 535.37  -325.37   -1.42 
##   54    NJ     650 328.53   321.47    1.40 
##   57    NJ     639 328.53   310.47    1.36 
##   14    CA     841 535.37   305.63    1.33 
##   21    CA     235 535.37  -300.37   -1.31 
## 
## ---------------------------------------- 
## Plot 1: 95% family-wise confidence level 
## Plot 2: Scatterplot with Cell Means 
## ----------------------------------------
# Price within each state, drawn for visual comparison of the groups.
# NOTE(review): the suggestions echoed for this call talk about plotting the
# group means, which hints at a scatterplot with means rather than box plots;
# confirm the "(Boxplot)" wording in the title against the rendered figure.
Plot(
  State, Price,
  data = home,
  fill = "skyblue",
  main = "Home Prices by State (Boxplot)"
)

## 
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(State, Price, data=home, fill="skyblue", main="Home Prices by State (Boxplot)", means=FALSE)  # do not plot means
## Plot(State, Price, data=home, fill="skyblue", main="Home Prices by State (Boxplot)", stat="mean")  # only plot means
## ANOVA(Price ~ State)  # inferential analysis 
## 
## Price 
##   - by levels of - 
## State 
##  
##       n   miss       mean         sd        min        mdn        max 
## CA   30      0    535.367    269.177    145.000    554.000   1095.000 
## NJ   30      0    328.533    157.973    115.000    279.000    650.000 
## NY   30      0    365.333    317.822     35.000    256.500   1250.000 
## PA   30      0    265.567    137.089     50.000    232.500    550.000 
##