1. Data cleanup, including decision to remove/drop records (if applicable);

data <- read.csv("~/Desktop/Big data and Econometrics/Building_Permits.csv")
head(data)
##   Assessor.Book Assessor.Page Assessor.Parcel
## 1          2354           016             056
## 2          5017           025             015
## 3          5478           017             040
## 4          5153           026             004
## 5          4422           006             007
## 6          4260           017             024
##                                                                              Tract
## 1                                                                          TR 4295
## 2                                                    MILHOUSE VERMONT AVENUE TRACT
## 3                                                                          TR 2596
## 4 DAY, HINTON AND MATHES RESUBDIVISION OF LOTS 1, 2, 3, 4, 5, 6 AND INCLUDED ALLEY
## 5                                                                          TR 9300
## 6                                                                          TR 8369
##   Block Lot Reference....Old.Permit...     PCIS.Permit..         Status
## 1       225                            13041-10000-24108 Permit Finaled
## 2         3                            20041-10000-19010         Issued
## 3        99                            19041-10000-34160 Permit Finaled
## 4         4                            21041-10000-92658         Issued
## 5    80   3                  20WL05483 16030-30001-00598 Permit Finaled
## 6        60                            20043-10000-04664 Permit Finaled
##              Status.Date       Permit.Type        Permit.Sub.Type
## 1 09/12/2013 12:00:00 AM        Electrical 1 or 2 Family Dwelling
## 2 06/10/2020 12:00:00 AM        Electrical             Commercial
## 3 07/07/2020 12:00:00 AM        Electrical 1 or 2 Family Dwelling
## 4 10/27/2021 12:00:00 AM        Electrical              Apartment
## 5 06/17/2020 12:00:00 AM Bldg-Alter/Repair 1 or 2 Family Dwelling
## 6 10/27/2020 12:00:00 AM    Fire Sprinkler             Commercial
##   Permit.Category Project.Number Event.Code Initiating.Office Issue.Date
## 1   No Plan Check             NA                        METRO 09/04/2013
## 2   No Plan Check             NA                        METRO 06/10/2020
## 3   No Plan Check             NA                        METRO 08/29/2019
## 4   No Plan Check             NA                        METRO 10/27/2021
## 5   No Plan Check             NA                      WEST LA 02/25/2020
## 6      Plan Check             NA                        METRO 09/24/2020
##   Address.Start Address.Fraction.Start Address.End Address.Fraction.End
## 1         11551                              11551                     
## 2          5129                               5129                     
## 3          5224                               5224                     
## 4           307                                307                     
## 5          1325                               1325                     
## 6         11400                              11400                     
##   Street.Direction Street.Name Street.Suffix Suffix.Direction Unit.Range.Start
## 1                W       KLING            ST                                  
## 2                S     VERMONT           AVE                                  
## 3                E         HUB            ST                                  
## 4                S      WITMER            ST                               106
## 5                N  CHAUTAUQUA          BLVD                                  
## 6                W     OLYMPIC          BLVD                              1530
##   Unit.Range.End Zip.Code
## 1                   91602
## 2                   90037
## 3                   90042
## 4                   90017
## 5                   90272
## 6                   90064
##                                                        Work.Description
## 1                                                                      
## 2                                                                      
## 3                                                                      
## 4                                                                      
## 5 SUPPLEMENTAL TO PERMIT #16030-20000-00598 TO CHANGE OWNER INFORMATION
## 6           Relocate 7 and add 4  fire sprinkler heads as part of a TI.
##   Valuation Floor.Area.L.A..Zoning.Code.Definition
## 1        NA                                     NA
## 2        NA                                     NA
## 3        NA                                     NA
## 4        NA                                     NA
## 5         0                                     NA
## 6        NA                                     NA
##   X..of.Residential.Dwelling.Units X..of.Stories  Contractor.s.Business.Name
## 1                               NA            NA             HIRSCH ELECTRIC
## 2                               NA            NA               MENDOZA OSCAR
## 3                               NA            NA               OWNER-BUILDER
## 4                               NA            NA          SKY REMODELING INC
## 5                               NA            NA               OWNER-BUILDER
## 6                               NA            NA REGENCY FIRE PROTECTION INC
##         Contractor.Address Contractor.City Contractor.State License.Type
## 1    102 N HAYWORTH AVE #4     LOS ANGELES               CA          C10
## 2        152 1/2 E 73RD ST     LOS ANGELES               CA          C10
## 3                                                                   <NA>
## 4 12439 MAGNOLIA BLVD #245  VALLEY VILLAGE               CA            B
## 5                                                                   <NA>
## 6                                                                    C16
##   License.. Principal.First.Name Principal.Middle.Name Principal.Last.Name
## 1    743608               KAROLY                                 HIDEGKUTI
## 2   1044737                OSCAR                                   MENDOZA
## 3         0                                                               
## 4    955109               TZVIKA               MICHAEL               DINER
## 5         0                                                               
## 6    523116                                                      TAL DAGAN
##   License.Expiration.Date Applicant.First.Name Applicant.Last.Name
## 1              12/31/2013                                         
## 2              09/30/2020                OSCAR             MENDOZA
## 3              01/01/2009                JASON               LEUNG
## 4              11/30/2022                  RAZ            GRINBAUM
## 5                                         JOEL              TORRES
## 6                                         Liat              Kivity
##   Applicant.Business.Name  Applicant.Address.1 Applicant.Address.2
## 1                                                                 
## 2                                                                 
## 3           OWNER-BUILDER                                         
## 4                                                                 
## 5                                                                 
## 6                         7651 Densmore avenue                    
##   Applicant.Address.3        Zone Occupancy
## 1                            R1-1          
## 2                     C2-1VL-CPIO          
## 3                            R1-1          
## 4                            R3-1          
## 5                            R1-1          
## 6            Van Nuys        M2-1          
##   Floor.Area.L.A..Building.Code.Definition Census.Tract     Latitude.Longitude
## 1                                        0      1433.00  (34.1559, -118.38389)
## 2                                        0      2323.00 (33.99616, -118.29167)
## 3                                        0      1835.10 (34.11679, -118.20276)
## 4                                        0      2091.03 (34.05879, -118.26377)
## 5                                       NA      2625.01 (34.05319, -118.52005)
## 6                                        0      2677.00 (34.03666, -118.44346)

1.1 Take a look at the data as a whole with the skim function

skimr::skim(data)
Data summary
Name data
Number of rows 1526025
Number of columns 54
_______________________
Column type frequency:
character 43
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Assessor.Book 0 1.00 0 4 1517 1172 0
Assessor.Page 0 1.00 0 3 1517 117 0
Assessor.Parcel 0 1.00 0 3 1517 378 0
Tract 0 1.00 0 160 5974 24362 0
Block 0 1.00 0 6 1232583 326 0
Lot 0 1.00 0 40 10136 4435 0
Reference….Old.Permit… 0 1.00 0 9 868680 459335 0
PCIS.Permit.. 0 1.00 17 17 0 1524175 0
Status 0 1.00 6 24 0 32 0
Status.Date 0 1.00 22 22 0 3475 0
Permit.Type 0 1.00 4 20 0 18 0
Permit.Sub.Type 0 1.00 6 22 0 7 0
Permit.Category 0 1.00 10 13 0 2 0
Event.Code 0 1.00 0 6 1526021 4 0
Initiating.Office 0 1.00 3 8 0 7 0
Issue.Date 0 1.00 10 10 0 3538 0
Address.Fraction.Start 0 1.00 0 3 1503645 45 0
Address.Fraction.End 0 1.00 0 3 1486786 64 0
Street.Direction 0 1.00 0 1 14492 5 0
Street.Name 0 1.00 0 29 11 7792 0
Street.Suffix 0 1.00 0 4 31134 41 0
Suffix.Direction 0 1.00 0 5 1516970 13 0
Unit.Range.Start 2 1.00 0 10 1233069 44123 0
Unit.Range.End 0 1.00 0 10 1493161 7528 0
Work.Description 0 1.00 0 70 837304 467679 0
Contractor.s.Business.Name 0 1.00 0 65 5617 45543 0
Contractor.Address 0 1.00 0 40 212681 57571 0
Contractor.City 0 1.00 0 25 218474 1751 0
Contractor.State 0 1.00 0 2 218588 60 0
License.Type 205076 0.87 0 3 29 110 0
Principal.First.Name 0 1.00 0 15 197961 9879 0
Principal.Middle.Name 4 1.00 0 12 598558 8118 0
Principal.Last.Name 1 1.00 0 35 195049 25218 0
License.Expiration.Date 0 1.00 0 10 153517 951 0
Applicant.First.Name 7 1.00 0 15 233528 39617 0
Applicant.Last.Name 12 1.00 0 15 398825 59535 0
Applicant.Business.Name 0 1.00 0 30 1394767 22790 0
Applicant.Address.1 16 1.00 0 30 652230 118491 0
Applicant.Address.2 1 1.00 0 30 1275200 7507 0
Applicant.Address.3 17 1.00 0 30 662209 8652 0
Zone 0 1.00 0 34 3739 1919 0
Occupancy 0 1.00 0 16 1503145 29 0
Latitude.Longitude 0 1.00 0 22 80838 393225 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Project.Number 1512410 0.01 4.40 1.41 1.0 5.00 5.00 5.00 5.00 ▂▁▁▁▇
Address.Start 16 1.00 6666.63 6041.17 1.0 1515.00 4921.00 10580.00 83011.00 ▇▁▁▁▁
Address.End 21 1.00 6667.16 6042.74 1.0 1515.00 4921.00 10580.00 84113.00 ▇▁▁▁▁
Zip.Code 94 1.00 90560.85 905.25 0.0 90033.00 90067.00 91331.00 91803.00 ▁▁▁▁▇
Valuation 947162 0.38 143943.65 2802822.29 0.0 2200.00 8000.00 30000.00 583800300.00 ▇▁▁▁▁
Floor.Area.L.A..Zoning.Code.Definition 1394471 0.09 3622.17 32160.31 -302813.0 77.00 542.00 2105.00 5107724.00 ▇▁▁▁▁
X..of.Residential.Dwelling.Units 1456737 0.05 1.79 18.97 -147.0 0.00 0.00 1.00 1150.00 ▇▁▁▁▁
X..of.Stories 1395910 0.09 1.55 26.91 -3.0 0.00 1.00 2.00 4654.00 ▇▁▁▁▁
License.. 29 1.00 690006.70 359607.61 0.0 505706.00 806827.00 961891.00 88318213.00 ▇▁▁▁▁
Floor.Area.L.A..Building.Code.Definition 323167 0.79 295.32 11028.13 -826056.0 0.00 0.00 0.00 5214880.00 ▇▁▁▁▁
Census.Tract 1247 1.00 2045.53 1160.25 1011.1 1341.01 1959.02 2397.02 9800.31 ▇▁▁▁▁
names(data)
##  [1] "Assessor.Book"                           
##  [2] "Assessor.Page"                           
##  [3] "Assessor.Parcel"                         
##  [4] "Tract"                                   
##  [5] "Block"                                   
##  [6] "Lot"                                     
##  [7] "Reference....Old.Permit..."              
##  [8] "PCIS.Permit.."                           
##  [9] "Status"                                  
## [10] "Status.Date"                             
## [11] "Permit.Type"                             
## [12] "Permit.Sub.Type"                         
## [13] "Permit.Category"                         
## [14] "Project.Number"                          
## [15] "Event.Code"                              
## [16] "Initiating.Office"                       
## [17] "Issue.Date"                              
## [18] "Address.Start"                           
## [19] "Address.Fraction.Start"                  
## [20] "Address.End"                             
## [21] "Address.Fraction.End"                    
## [22] "Street.Direction"                        
## [23] "Street.Name"                             
## [24] "Street.Suffix"                           
## [25] "Suffix.Direction"                        
## [26] "Unit.Range.Start"                        
## [27] "Unit.Range.End"                          
## [28] "Zip.Code"                                
## [29] "Work.Description"                        
## [30] "Valuation"                               
## [31] "Floor.Area.L.A..Zoning.Code.Definition"  
## [32] "X..of.Residential.Dwelling.Units"        
## [33] "X..of.Stories"                           
## [34] "Contractor.s.Business.Name"              
## [35] "Contractor.Address"                      
## [36] "Contractor.City"                         
## [37] "Contractor.State"                        
## [38] "License.Type"                            
## [39] "License.."                               
## [40] "Principal.First.Name"                    
## [41] "Principal.Middle.Name"                   
## [42] "Principal.Last.Name"                     
## [43] "License.Expiration.Date"                 
## [44] "Applicant.First.Name"                    
## [45] "Applicant.Last.Name"                     
## [46] "Applicant.Business.Name"                 
## [47] "Applicant.Address.1"                     
## [48] "Applicant.Address.2"                     
## [49] "Applicant.Address.3"                     
## [50] "Zone"                                    
## [51] "Occupancy"                               
## [52] "Floor.Area.L.A..Building.Code.Definition"
## [53] "Census.Tract"                            
## [54] "Latitude.Longitude"

1.2 Standardize data column names

data <- data %>% 
  janitor::clean_names()

1.3 Unified Date Format

attach(data)
data$status_date <- as.Date(status_date, "%m/%d/%Y")
data$issue_date <- as.Date(issue_date, "%m/%d/%Y")

1.4 Add length of time variable

data$diff_dates <- difftime(dmy(substr(status_date,1,10)) ,dmy(issue_date), units = 'days')
summary(data$diff_dates)
##   Length    Class     Mode 
##  1526025 difftime  numeric

2. Number of unique permits;

data %>% 
  select(matches("permit")) %>%  
  head()
##   reference_old_permit       pcis_permit       permit_type
## 1                      13041-10000-24108        Electrical
## 2                      20041-10000-19010        Electrical
## 3                      19041-10000-34160        Electrical
## 4                      21041-10000-92658        Electrical
## 5            20WL05483 16030-30001-00598 Bldg-Alter/Repair
## 6                      20043-10000-04664    Fire Sprinkler
##          permit_sub_type permit_category
## 1 1 or 2 Family Dwelling   No Plan Check
## 2             Commercial   No Plan Check
## 3 1 or 2 Family Dwelling   No Plan Check
## 4              Apartment   No Plan Check
## 5 1 or 2 Family Dwelling   No Plan Check
## 6             Commercial      Plan Check

2.1 Number of unique permit type

data %>%
  count(permit_type)
##             permit_type      n
## 1         Bldg-Addition  52750
## 2     Bldg-Alter/Repair 360166
## 3       Bldg-Demolition  19966
## 4              Bldg-New  36750
## 5       Bldg-Relocation     13
## 6            Electrical 453402
## 7              Elevator  19237
## 8        Fire Sprinkler  58149
## 9               Grading  30048
## 10                 HVAC 144381
## 11     Nonbldg-Addition    259
## 12 Nonbldg-Alter/Repair   7264
## 13   Nonbldg-Demolition     94
## 14          Nonbldg-New  24870
## 15             Plumbing 276037
## 16      Pressure Vessel   2836
## 17                 Sign  17849
## 18    Swimming-Pool/Spa  21954

There are 18 unique permit types.

2.2 Number of unique permit sub-type

data %>%
  count(permit_sub_type)
##          permit_sub_type      n
## 1 1 or 2 Family Dwelling 875690
## 2              Apartment 269854
## 3             Commercial 352588
## 4                Offsite    354
## 5                 Onsite  17495
## 6     Public Safety Only   2753
## 7      Special Equipment   7291

There are 7 unique permit sub-types.

2.3 Number of unique permit category

data %>%
  count(permit_category)
##   permit_category       n
## 1   No Plan Check 1016559
## 2      Plan Check  509466

There are 2 unique permit categories.

3. Distribution of updates (records) per permit;

result1 <- data.frame(data %>%
                        count(permit_type))

data.frame(table(data$permit_type))%>%
  ggplot(aes(Var1,Freq))+
  geom_col()+
  theme(axis.text.x  = element_text(angle = 60,hjust = 1))+
  geom_text(aes(label = Freq),size = 3,vjust =-0.5)+
  labs(x = "Permit type",y = "Frequency",title = "Histgram of permit type")

data.frame(table(data$permit_sub_type))%>%
  ggplot(aes(Var1,Freq))+
  geom_col()+
  geom_text(aes(label = Freq),size = 3,vjust =-0.5)+
  theme(axis.text.x  = element_text(angle = 60,hjust = 1))+
  labs(x = "permit_sub_type",y = "Frequency",title = "Histgram of permit sub type")

data.frame(table(data$permit_category))%>%
  ggplot(aes(Var1,Freq))+
  geom_col()+
  geom_text(aes(label = Freq),size = 3,vjust =-0.5)+
  theme(axis.text.x  = element_text(angle = 60,hjust = 1))+
  labs(x = 'permit_category',y = "Frequency",title = "Histgram of permit category")

4. Length of time (days) from issue to completion, by type (Permit Type,Permit Sub-Type,Permit Category) of permit.

4.1 Remove rows with diff_dates less than 0 or NA and put the newly obtained data into data2.

data2 <- data%>%filter(diff_dates >= 0)
data2 <- data2[-which(is.na(data$diff_dates)),]

4.3 Average length of time (days) from issue to completion by permit type (Permit Type,Permit Sub-Type,Permit Category)

aggregate(data = data2, diff_dates ~ permit_type + permit_sub_type + permit_category,mean)
##             permit_type        permit_sub_type permit_category     diff_dates
## 1         Bldg-Addition 1 or 2 Family Dwelling   No Plan Check  63.00000 days
## 2     Bldg-Alter/Repair 1 or 2 Family Dwelling   No Plan Check 163.08475 days
## 3       Bldg-Demolition 1 or 2 Family Dwelling   No Plan Check 293.66667 days
## 4            Electrical 1 or 2 Family Dwelling   No Plan Check 169.27650 days
## 5              Elevator 1 or 2 Family Dwelling   No Plan Check 267.75000 days
## 6        Fire Sprinkler 1 or 2 Family Dwelling   No Plan Check 137.32110 days
## 7               Grading 1 or 2 Family Dwelling   No Plan Check 401.43750 days
## 8                  HVAC 1 or 2 Family Dwelling   No Plan Check 162.91245 days
## 9  Nonbldg-Alter/Repair 1 or 2 Family Dwelling   No Plan Check 116.93333 days
## 10          Nonbldg-New 1 or 2 Family Dwelling   No Plan Check  98.28000 days
## 11             Plumbing 1 or 2 Family Dwelling   No Plan Check 148.20603 days
## 12      Pressure Vessel 1 or 2 Family Dwelling   No Plan Check 390.00000 days
## 13    Swimming-Pool/Spa 1 or 2 Family Dwelling   No Plan Check 221.17391 days
## 14    Bldg-Alter/Repair              Apartment   No Plan Check 157.57729 days
## 15           Electrical              Apartment   No Plan Check 152.15316 days
## 16             Elevator              Apartment   No Plan Check 121.73869 days
## 17       Fire Sprinkler              Apartment   No Plan Check 185.02778 days
## 18              Grading              Apartment   No Plan Check 503.40000 days
## 19                 HVAC              Apartment   No Plan Check 170.25237 days
## 20 Nonbldg-Alter/Repair              Apartment   No Plan Check 135.00000 days
## 21          Nonbldg-New              Apartment   No Plan Check  77.80000 days
## 22             Plumbing              Apartment   No Plan Check 173.98746 days
## 23      Pressure Vessel              Apartment   No Plan Check 135.21622 days
## 24    Swimming-Pool/Spa              Apartment   No Plan Check   0.00000 days
## 25    Bldg-Alter/Repair             Commercial   No Plan Check 162.52069 days
## 26           Electrical             Commercial   No Plan Check 171.72459 days
## 27             Elevator             Commercial   No Plan Check 132.12937 days
## 28       Fire Sprinkler             Commercial   No Plan Check 166.75266 days
## 29              Grading             Commercial   No Plan Check 227.62500 days
## 30                 HVAC             Commercial   No Plan Check 169.77027 days
## 31 Nonbldg-Alter/Repair             Commercial   No Plan Check 121.10000 days
## 32          Nonbldg-New             Commercial   No Plan Check 212.88235 days
## 33             Plumbing             Commercial   No Plan Check 169.94178 days
## 34      Pressure Vessel             Commercial   No Plan Check 123.41667 days
## 35                 Sign                 Onsite   No Plan Check 144.12500 days
## 36           Electrical     Public Safety Only   No Plan Check  93.94355 days
## 37           Electrical      Special Equipment   No Plan Check 215.27333 days
## 38        Bldg-Addition 1 or 2 Family Dwelling      Plan Check 269.50916 days
## 39    Bldg-Alter/Repair 1 or 2 Family Dwelling      Plan Check 210.99249 days
## 40      Bldg-Demolition 1 or 2 Family Dwelling      Plan Check 205.23956 days
## 41             Bldg-New 1 or 2 Family Dwelling      Plan Check 289.40635 days
## 42           Electrical 1 or 2 Family Dwelling      Plan Check 154.77431 days
## 43             Elevator 1 or 2 Family Dwelling      Plan Check 291.10417 days
## 44       Fire Sprinkler 1 or 2 Family Dwelling      Plan Check 274.57551 days
## 45              Grading 1 or 2 Family Dwelling      Plan Check 306.84497 days
## 46                 HVAC 1 or 2 Family Dwelling      Plan Check   0.00000 days
## 47     Nonbldg-Addition 1 or 2 Family Dwelling      Plan Check 178.37500 days
## 48 Nonbldg-Alter/Repair 1 or 2 Family Dwelling      Plan Check 237.44776 days
## 49          Nonbldg-New 1 or 2 Family Dwelling      Plan Check 264.37220 days
## 50             Plumbing 1 or 2 Family Dwelling      Plan Check 401.96552 days
## 51    Swimming-Pool/Spa 1 or 2 Family Dwelling      Plan Check 245.53598 days
## 52        Bldg-Addition              Apartment      Plan Check 352.25581 days
## 53    Bldg-Alter/Repair              Apartment      Plan Check 225.92746 days
## 54      Bldg-Demolition              Apartment      Plan Check 292.38636 days
## 55             Bldg-New              Apartment      Plan Check 395.55556 days
## 56           Electrical              Apartment      Plan Check 221.45545 days
## 57             Elevator              Apartment      Plan Check 244.46377 days
## 58       Fire Sprinkler              Apartment      Plan Check 319.87179 days
## 59              Grading              Apartment      Plan Check 297.53061 days
## 60                 HVAC              Apartment      Plan Check 374.89655 days
## 61 Nonbldg-Alter/Repair              Apartment      Plan Check 220.87500 days
## 62          Nonbldg-New              Apartment      Plan Check 394.99029 days
## 63             Plumbing              Apartment      Plan Check 288.81166 days
## 64    Swimming-Pool/Spa              Apartment      Plan Check 244.64706 days
## 65        Bldg-Addition             Commercial      Plan Check 307.53659 days
## 66    Bldg-Alter/Repair             Commercial      Plan Check 213.08066 days
## 67      Bldg-Demolition             Commercial      Plan Check 226.55319 days
## 68             Bldg-New             Commercial      Plan Check 279.96923 days
## 69           Electrical             Commercial      Plan Check 183.56192 days
## 70             Elevator             Commercial      Plan Check 217.42466 days
## 71       Fire Sprinkler             Commercial      Plan Check 235.33618 days
## 72              Grading             Commercial      Plan Check 353.51818 days
## 73                 HVAC             Commercial      Plan Check 200.52609 days
## 74     Nonbldg-Addition             Commercial      Plan Check   0.00000 days
## 75 Nonbldg-Alter/Repair             Commercial      Plan Check 165.46364 days
## 76   Nonbldg-Demolition             Commercial      Plan Check 282.16667 days
## 77          Nonbldg-New             Commercial      Plan Check 290.52083 days
## 78             Plumbing             Commercial      Plan Check 221.61246 days
## 79    Swimming-Pool/Spa             Commercial      Plan Check 235.37500 days
## 80                 Sign                Offsite      Plan Check 282.00000 days
## 81                 Sign                 Onsite      Plan Check 139.10940 days
## 82           Electrical     Public Safety Only      Plan Check 303.80000 days
## 83           Electrical      Special Equipment      Plan Check   0.00000 days

5. Build a model to estimate the length of time for permit finalization, based on characteristics in this dataset.

5.1 Preliminarily screen some useful variables before modeling and remove meaningless variables

data3 <- data2[,c(1:3,6,11:13,16,22,55)]

5.2 Data cleaning in this step

data3 <- data3%>%filter(assessor_parcel != "***")  # remove "***"
data3$lot <- str_match(data3$lot,'[0-9]') # match value

5.2.1 Transform some variables into numerical form.

data3$diff_dates <- as.numeric(data3$diff_dates)
data3$assessor_page <- as.numeric(data3$assessor_page)
data3$assessor_parcel <- as.numeric(data3$assessor_parcel)
data3$lot <- as.numeric(data3$lot)

5.2.2 Remove NA in data3

data3 <- na.omit(data3) 

5.3 # Random split the data into 70% for learning and 30% for testing.

set.seed(1) # set seed
n <- sample(nrow(data3),nrow(data3)*0.7)
train <- data3[n,] # train set
test <- data3[-n,] # test set

5.4 Modeling

5.4.1 Random forest model

rf <- randomForest(diff_dates ~., data = train) #random forest model
p.rf <- predict(rf,test) # predict value
rmse.rf <- sqrt(mean((p.rf - test$diff_dates)^2));rmse.rf # calculate rmse
## [1] 292.7875

5.4.2 Lasso regression

train.x <- data.matrix(train[,-10])
train.y <- data.matrix(train$diff_dates)
test.x <- data.matrix(test[,-10])

lm.lasso <- cv.glmnet(train.x,train.y,alpha=1,family = 'gaussian') # modeling
plot(lm.lasso)

p.reg <- predict(lm.lasso,test.x,s=lm.lasso$lambda.min) # predict value, lambda chooses the value with the smallest error
rmse.las.reg <- sqrt(mean((p.reg - test$diff_dates)^2));rmse.las.reg # # calculate rmse
## [1] 293.7309

5.4.3 stepwise regression

m3 <- lm(diff_dates ~., data = train)
sr <- step(m3,trace = 0)
p.sr <- predict(sr,test) # predict value
rmse.sr <- sqrt(mean((p.sr - test$diff_dates)^2));rmse.sr# calculate rmse
## [1] 292.4311

5.4.4 Model test error comparison

cp <- data.frame(model_name = c('random forest model',
                                'lasso regression',
                                'stepwise regression'),
                 RMSE = c(rmse.rf,rmse.las.reg,rmse.sr))
knitr::kable(cp)
model_name RMSE
random forest model 292.7875
lasso regression 293.7309
stepwise regression 292.4311

The difference between the RMSE results obtained by these three methods is not very large, and if we have to choose one, we can see that the stepwise regression results are relatively better.