data <- read.csv("~/Desktop/Big data and Econometrics/Building_Permits.csv")
head(data)
## Assessor.Book Assessor.Page Assessor.Parcel
## 1 2354 016 056
## 2 5017 025 015
## 3 5478 017 040
## 4 5153 026 004
## 5 4422 006 007
## 6 4260 017 024
## Tract
## 1 TR 4295
## 2 MILHOUSE VERMONT AVENUE TRACT
## 3 TR 2596
## 4 DAY, HINTON AND MATHES RESUBDIVISION OF LOTS 1, 2, 3, 4, 5, 6 AND INCLUDED ALLEY
## 5 TR 9300
## 6 TR 8369
## Block Lot Reference....Old.Permit... PCIS.Permit.. Status
## 1 225 13041-10000-24108 Permit Finaled
## 2 3 20041-10000-19010 Issued
## 3 99 19041-10000-34160 Permit Finaled
## 4 4 21041-10000-92658 Issued
## 5 80 3 20WL05483 16030-30001-00598 Permit Finaled
## 6 60 20043-10000-04664 Permit Finaled
## Status.Date Permit.Type Permit.Sub.Type
## 1 09/12/2013 12:00:00 AM Electrical 1 or 2 Family Dwelling
## 2 06/10/2020 12:00:00 AM Electrical Commercial
## 3 07/07/2020 12:00:00 AM Electrical 1 or 2 Family Dwelling
## 4 10/27/2021 12:00:00 AM Electrical Apartment
## 5 06/17/2020 12:00:00 AM Bldg-Alter/Repair 1 or 2 Family Dwelling
## 6 10/27/2020 12:00:00 AM Fire Sprinkler Commercial
## Permit.Category Project.Number Event.Code Initiating.Office Issue.Date
## 1 No Plan Check NA METRO 09/04/2013
## 2 No Plan Check NA METRO 06/10/2020
## 3 No Plan Check NA METRO 08/29/2019
## 4 No Plan Check NA METRO 10/27/2021
## 5 No Plan Check NA WEST LA 02/25/2020
## 6 Plan Check NA METRO 09/24/2020
## Address.Start Address.Fraction.Start Address.End Address.Fraction.End
## 1 11551 11551
## 2 5129 5129
## 3 5224 5224
## 4 307 307
## 5 1325 1325
## 6 11400 11400
## Street.Direction Street.Name Street.Suffix Suffix.Direction Unit.Range.Start
## 1 W KLING ST
## 2 S VERMONT AVE
## 3 E HUB ST
## 4 S WITMER ST 106
## 5 N CHAUTAUQUA BLVD
## 6 W OLYMPIC BLVD 1530
## Unit.Range.End Zip.Code
## 1 91602
## 2 90037
## 3 90042
## 4 90017
## 5 90272
## 6 90064
## Work.Description
## 1
## 2
## 3
## 4
## 5 SUPPLEMENTAL TO PERMIT #16030-20000-00598 TO CHANGE OWNER INFORMATION
## 6 Relocate 7 and add 4 fire sprinkler heads as part of a TI.
## Valuation Floor.Area.L.A..Zoning.Code.Definition
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 0 NA
## 6 NA NA
## X..of.Residential.Dwelling.Units X..of.Stories Contractor.s.Business.Name
## 1 NA NA HIRSCH ELECTRIC
## 2 NA NA MENDOZA OSCAR
## 3 NA NA OWNER-BUILDER
## 4 NA NA SKY REMODELING INC
## 5 NA NA OWNER-BUILDER
## 6 NA NA REGENCY FIRE PROTECTION INC
## Contractor.Address Contractor.City Contractor.State License.Type
## 1 102 N HAYWORTH AVE #4 LOS ANGELES CA C10
## 2 152 1/2 E 73RD ST LOS ANGELES CA C10
## 3 <NA>
## 4 12439 MAGNOLIA BLVD #245 VALLEY VILLAGE CA B
## 5 <NA>
## 6 C16
## License.. Principal.First.Name Principal.Middle.Name Principal.Last.Name
## 1 743608 KAROLY HIDEGKUTI
## 2 1044737 OSCAR MENDOZA
## 3 0
## 4 955109 TZVIKA MICHAEL DINER
## 5 0
## 6 523116 TAL DAGAN
## License.Expiration.Date Applicant.First.Name Applicant.Last.Name
## 1 12/31/2013
## 2 09/30/2020 OSCAR MENDOZA
## 3 01/01/2009 JASON LEUNG
## 4 11/30/2022 RAZ GRINBAUM
## 5 JOEL TORRES
## 6 Liat Kivity
## Applicant.Business.Name Applicant.Address.1 Applicant.Address.2
## 1
## 2
## 3 OWNER-BUILDER
## 4
## 5
## 6 7651 Densmore avenue
## Applicant.Address.3 Zone Occupancy
## 1 R1-1
## 2 C2-1VL-CPIO
## 3 R1-1
## 4 R3-1
## 5 R1-1
## 6 Van Nuys M2-1
## Floor.Area.L.A..Building.Code.Definition Census.Tract Latitude.Longitude
## 1 0 1433.00 (34.1559, -118.38389)
## 2 0 2323.00 (33.99616, -118.29167)
## 3 0 1835.10 (34.11679, -118.20276)
## 4 0 2091.03 (34.05879, -118.26377)
## 5 NA 2625.01 (34.05319, -118.52005)
## 6 0 2677.00 (34.03666, -118.44346)
skimr::skim(data)
| Name | data |
| Number of rows | 1526025 |
| Number of columns | 54 |
| _______________________ | |
| Column type frequency: | |
| character | 43 |
| numeric | 11 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Assessor.Book | 0 | 1.00 | 0 | 4 | 1517 | 1172 | 0 |
| Assessor.Page | 0 | 1.00 | 0 | 3 | 1517 | 117 | 0 |
| Assessor.Parcel | 0 | 1.00 | 0 | 3 | 1517 | 378 | 0 |
| Tract | 0 | 1.00 | 0 | 160 | 5974 | 24362 | 0 |
| Block | 0 | 1.00 | 0 | 6 | 1232583 | 326 | 0 |
| Lot | 0 | 1.00 | 0 | 40 | 10136 | 4435 | 0 |
| Reference….Old.Permit… | 0 | 1.00 | 0 | 9 | 868680 | 459335 | 0 |
| PCIS.Permit.. | 0 | 1.00 | 17 | 17 | 0 | 1524175 | 0 |
| Status | 0 | 1.00 | 6 | 24 | 0 | 32 | 0 |
| Status.Date | 0 | 1.00 | 22 | 22 | 0 | 3475 | 0 |
| Permit.Type | 0 | 1.00 | 4 | 20 | 0 | 18 | 0 |
| Permit.Sub.Type | 0 | 1.00 | 6 | 22 | 0 | 7 | 0 |
| Permit.Category | 0 | 1.00 | 10 | 13 | 0 | 2 | 0 |
| Event.Code | 0 | 1.00 | 0 | 6 | 1526021 | 4 | 0 |
| Initiating.Office | 0 | 1.00 | 3 | 8 | 0 | 7 | 0 |
| Issue.Date | 0 | 1.00 | 10 | 10 | 0 | 3538 | 0 |
| Address.Fraction.Start | 0 | 1.00 | 0 | 3 | 1503645 | 45 | 0 |
| Address.Fraction.End | 0 | 1.00 | 0 | 3 | 1486786 | 64 | 0 |
| Street.Direction | 0 | 1.00 | 0 | 1 | 14492 | 5 | 0 |
| Street.Name | 0 | 1.00 | 0 | 29 | 11 | 7792 | 0 |
| Street.Suffix | 0 | 1.00 | 0 | 4 | 31134 | 41 | 0 |
| Suffix.Direction | 0 | 1.00 | 0 | 5 | 1516970 | 13 | 0 |
| Unit.Range.Start | 2 | 1.00 | 0 | 10 | 1233069 | 44123 | 0 |
| Unit.Range.End | 0 | 1.00 | 0 | 10 | 1493161 | 7528 | 0 |
| Work.Description | 0 | 1.00 | 0 | 70 | 837304 | 467679 | 0 |
| Contractor.s.Business.Name | 0 | 1.00 | 0 | 65 | 5617 | 45543 | 0 |
| Contractor.Address | 0 | 1.00 | 0 | 40 | 212681 | 57571 | 0 |
| Contractor.City | 0 | 1.00 | 0 | 25 | 218474 | 1751 | 0 |
| Contractor.State | 0 | 1.00 | 0 | 2 | 218588 | 60 | 0 |
| License.Type | 205076 | 0.87 | 0 | 3 | 29 | 110 | 0 |
| Principal.First.Name | 0 | 1.00 | 0 | 15 | 197961 | 9879 | 0 |
| Principal.Middle.Name | 4 | 1.00 | 0 | 12 | 598558 | 8118 | 0 |
| Principal.Last.Name | 1 | 1.00 | 0 | 35 | 195049 | 25218 | 0 |
| License.Expiration.Date | 0 | 1.00 | 0 | 10 | 153517 | 951 | 0 |
| Applicant.First.Name | 7 | 1.00 | 0 | 15 | 233528 | 39617 | 0 |
| Applicant.Last.Name | 12 | 1.00 | 0 | 15 | 398825 | 59535 | 0 |
| Applicant.Business.Name | 0 | 1.00 | 0 | 30 | 1394767 | 22790 | 0 |
| Applicant.Address.1 | 16 | 1.00 | 0 | 30 | 652230 | 118491 | 0 |
| Applicant.Address.2 | 1 | 1.00 | 0 | 30 | 1275200 | 7507 | 0 |
| Applicant.Address.3 | 17 | 1.00 | 0 | 30 | 662209 | 8652 | 0 |
| Zone | 0 | 1.00 | 0 | 34 | 3739 | 1919 | 0 |
| Occupancy | 0 | 1.00 | 0 | 16 | 1503145 | 29 | 0 |
| Latitude.Longitude | 0 | 1.00 | 0 | 22 | 80838 | 393225 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Project.Number | 1512410 | 0.01 | 4.40 | 1.41 | 1.0 | 5.00 | 5.00 | 5.00 | 5.00 | ▂▁▁▁▇ |
| Address.Start | 16 | 1.00 | 6666.63 | 6041.17 | 1.0 | 1515.00 | 4921.00 | 10580.00 | 83011.00 | ▇▁▁▁▁ |
| Address.End | 21 | 1.00 | 6667.16 | 6042.74 | 1.0 | 1515.00 | 4921.00 | 10580.00 | 84113.00 | ▇▁▁▁▁ |
| Zip.Code | 94 | 1.00 | 90560.85 | 905.25 | 0.0 | 90033.00 | 90067.00 | 91331.00 | 91803.00 | ▁▁▁▁▇ |
| Valuation | 947162 | 0.38 | 143943.65 | 2802822.29 | 0.0 | 2200.00 | 8000.00 | 30000.00 | 583800300.00 | ▇▁▁▁▁ |
| Floor.Area.L.A..Zoning.Code.Definition | 1394471 | 0.09 | 3622.17 | 32160.31 | -302813.0 | 77.00 | 542.00 | 2105.00 | 5107724.00 | ▇▁▁▁▁ |
| X..of.Residential.Dwelling.Units | 1456737 | 0.05 | 1.79 | 18.97 | -147.0 | 0.00 | 0.00 | 1.00 | 1150.00 | ▇▁▁▁▁ |
| X..of.Stories | 1395910 | 0.09 | 1.55 | 26.91 | -3.0 | 0.00 | 1.00 | 2.00 | 4654.00 | ▇▁▁▁▁ |
| License.. | 29 | 1.00 | 690006.70 | 359607.61 | 0.0 | 505706.00 | 806827.00 | 961891.00 | 88318213.00 | ▇▁▁▁▁ |
| Floor.Area.L.A..Building.Code.Definition | 323167 | 0.79 | 295.32 | 11028.13 | -826056.0 | 0.00 | 0.00 | 0.00 | 5214880.00 | ▇▁▁▁▁ |
| Census.Tract | 1247 | 1.00 | 2045.53 | 1160.25 | 1011.1 | 1341.01 | 1959.02 | 2397.02 | 9800.31 | ▇▁▁▁▁ |
names(data)
## [1] "Assessor.Book"
## [2] "Assessor.Page"
## [3] "Assessor.Parcel"
## [4] "Tract"
## [5] "Block"
## [6] "Lot"
## [7] "Reference....Old.Permit..."
## [8] "PCIS.Permit.."
## [9] "Status"
## [10] "Status.Date"
## [11] "Permit.Type"
## [12] "Permit.Sub.Type"
## [13] "Permit.Category"
## [14] "Project.Number"
## [15] "Event.Code"
## [16] "Initiating.Office"
## [17] "Issue.Date"
## [18] "Address.Start"
## [19] "Address.Fraction.Start"
## [20] "Address.End"
## [21] "Address.Fraction.End"
## [22] "Street.Direction"
## [23] "Street.Name"
## [24] "Street.Suffix"
## [25] "Suffix.Direction"
## [26] "Unit.Range.Start"
## [27] "Unit.Range.End"
## [28] "Zip.Code"
## [29] "Work.Description"
## [30] "Valuation"
## [31] "Floor.Area.L.A..Zoning.Code.Definition"
## [32] "X..of.Residential.Dwelling.Units"
## [33] "X..of.Stories"
## [34] "Contractor.s.Business.Name"
## [35] "Contractor.Address"
## [36] "Contractor.City"
## [37] "Contractor.State"
## [38] "License.Type"
## [39] "License.."
## [40] "Principal.First.Name"
## [41] "Principal.Middle.Name"
## [42] "Principal.Last.Name"
## [43] "License.Expiration.Date"
## [44] "Applicant.First.Name"
## [45] "Applicant.Last.Name"
## [46] "Applicant.Business.Name"
## [47] "Applicant.Address.1"
## [48] "Applicant.Address.2"
## [49] "Applicant.Address.3"
## [50] "Zone"
## [51] "Occupancy"
## [52] "Floor.Area.L.A..Building.Code.Definition"
## [53] "Census.Tract"
## [54] "Latitude.Longitude"
data <- data %>%
janitor::clean_names()
attach(data)
data$status_date <- as.Date(status_date, "%m/%d/%Y")
data$issue_date <- as.Date(issue_date, "%m/%d/%Y")
data$diff_dates <- difftime(dmy(substr(status_date,1,10)) ,dmy(issue_date), units = 'days')
summary(data$diff_dates)
## Length Class Mode
## 1526025 difftime numeric
data %>%
select(matches("permit")) %>%
head()
## reference_old_permit pcis_permit permit_type
## 1 13041-10000-24108 Electrical
## 2 20041-10000-19010 Electrical
## 3 19041-10000-34160 Electrical
## 4 21041-10000-92658 Electrical
## 5 20WL05483 16030-30001-00598 Bldg-Alter/Repair
## 6 20043-10000-04664 Fire Sprinkler
## permit_sub_type permit_category
## 1 1 or 2 Family Dwelling No Plan Check
## 2 Commercial No Plan Check
## 3 1 or 2 Family Dwelling No Plan Check
## 4 Apartment No Plan Check
## 5 1 or 2 Family Dwelling No Plan Check
## 6 Commercial Plan Check
data %>%
count(permit_type)
## permit_type n
## 1 Bldg-Addition 52750
## 2 Bldg-Alter/Repair 360166
## 3 Bldg-Demolition 19966
## 4 Bldg-New 36750
## 5 Bldg-Relocation 13
## 6 Electrical 453402
## 7 Elevator 19237
## 8 Fire Sprinkler 58149
## 9 Grading 30048
## 10 HVAC 144381
## 11 Nonbldg-Addition 259
## 12 Nonbldg-Alter/Repair 7264
## 13 Nonbldg-Demolition 94
## 14 Nonbldg-New 24870
## 15 Plumbing 276037
## 16 Pressure Vessel 2836
## 17 Sign 17849
## 18 Swimming-Pool/Spa 21954
There are 18 unique permit types.
data %>%
count(permit_sub_type)
## permit_sub_type n
## 1 1 or 2 Family Dwelling 875690
## 2 Apartment 269854
## 3 Commercial 352588
## 4 Offsite 354
## 5 Onsite 17495
## 6 Public Safety Only 2753
## 7 Special Equipment 7291
There are 7 unique permit sub-types.
data %>%
count(permit_category)
## permit_category n
## 1 No Plan Check 1016559
## 2 Plan Check 509466
There are 2 unique permit categories.
result1 <- data.frame(data %>%
count(permit_type))
data.frame(table(data$permit_type))%>%
ggplot(aes(Var1,Freq))+
geom_col()+
theme(axis.text.x = element_text(angle = 60,hjust = 1))+
geom_text(aes(label = Freq),size = 3,vjust =-0.5)+
labs(x = "Permit type",y = "Frequency",title = "Histgram of permit type")
data.frame(table(data$permit_sub_type))%>%
ggplot(aes(Var1,Freq))+
geom_col()+
geom_text(aes(label = Freq),size = 3,vjust =-0.5)+
theme(axis.text.x = element_text(angle = 60,hjust = 1))+
labs(x = "permit_sub_type",y = "Frequency",title = "Histgram of permit sub type")
data.frame(table(data$permit_category))%>%
ggplot(aes(Var1,Freq))+
geom_col()+
geom_text(aes(label = Freq),size = 3,vjust =-0.5)+
theme(axis.text.x = element_text(angle = 60,hjust = 1))+
labs(x = 'permit_category',y = "Frequency",title = "Histgram of permit category")
data2 <- data%>%filter(diff_dates >= 0)
data2 <- data2[-which(is.na(data$diff_dates)),]
aggregate(data = data2, diff_dates ~ permit_type + permit_sub_type + permit_category,mean)
## permit_type permit_sub_type permit_category diff_dates
## 1 Bldg-Addition 1 or 2 Family Dwelling No Plan Check 63.00000 days
## 2 Bldg-Alter/Repair 1 or 2 Family Dwelling No Plan Check 163.08475 days
## 3 Bldg-Demolition 1 or 2 Family Dwelling No Plan Check 293.66667 days
## 4 Electrical 1 or 2 Family Dwelling No Plan Check 169.27650 days
## 5 Elevator 1 or 2 Family Dwelling No Plan Check 267.75000 days
## 6 Fire Sprinkler 1 or 2 Family Dwelling No Plan Check 137.32110 days
## 7 Grading 1 or 2 Family Dwelling No Plan Check 401.43750 days
## 8 HVAC 1 or 2 Family Dwelling No Plan Check 162.91245 days
## 9 Nonbldg-Alter/Repair 1 or 2 Family Dwelling No Plan Check 116.93333 days
## 10 Nonbldg-New 1 or 2 Family Dwelling No Plan Check 98.28000 days
## 11 Plumbing 1 or 2 Family Dwelling No Plan Check 148.20603 days
## 12 Pressure Vessel 1 or 2 Family Dwelling No Plan Check 390.00000 days
## 13 Swimming-Pool/Spa 1 or 2 Family Dwelling No Plan Check 221.17391 days
## 14 Bldg-Alter/Repair Apartment No Plan Check 157.57729 days
## 15 Electrical Apartment No Plan Check 152.15316 days
## 16 Elevator Apartment No Plan Check 121.73869 days
## 17 Fire Sprinkler Apartment No Plan Check 185.02778 days
## 18 Grading Apartment No Plan Check 503.40000 days
## 19 HVAC Apartment No Plan Check 170.25237 days
## 20 Nonbldg-Alter/Repair Apartment No Plan Check 135.00000 days
## 21 Nonbldg-New Apartment No Plan Check 77.80000 days
## 22 Plumbing Apartment No Plan Check 173.98746 days
## 23 Pressure Vessel Apartment No Plan Check 135.21622 days
## 24 Swimming-Pool/Spa Apartment No Plan Check 0.00000 days
## 25 Bldg-Alter/Repair Commercial No Plan Check 162.52069 days
## 26 Electrical Commercial No Plan Check 171.72459 days
## 27 Elevator Commercial No Plan Check 132.12937 days
## 28 Fire Sprinkler Commercial No Plan Check 166.75266 days
## 29 Grading Commercial No Plan Check 227.62500 days
## 30 HVAC Commercial No Plan Check 169.77027 days
## 31 Nonbldg-Alter/Repair Commercial No Plan Check 121.10000 days
## 32 Nonbldg-New Commercial No Plan Check 212.88235 days
## 33 Plumbing Commercial No Plan Check 169.94178 days
## 34 Pressure Vessel Commercial No Plan Check 123.41667 days
## 35 Sign Onsite No Plan Check 144.12500 days
## 36 Electrical Public Safety Only No Plan Check 93.94355 days
## 37 Electrical Special Equipment No Plan Check 215.27333 days
## 38 Bldg-Addition 1 or 2 Family Dwelling Plan Check 269.50916 days
## 39 Bldg-Alter/Repair 1 or 2 Family Dwelling Plan Check 210.99249 days
## 40 Bldg-Demolition 1 or 2 Family Dwelling Plan Check 205.23956 days
## 41 Bldg-New 1 or 2 Family Dwelling Plan Check 289.40635 days
## 42 Electrical 1 or 2 Family Dwelling Plan Check 154.77431 days
## 43 Elevator 1 or 2 Family Dwelling Plan Check 291.10417 days
## 44 Fire Sprinkler 1 or 2 Family Dwelling Plan Check 274.57551 days
## 45 Grading 1 or 2 Family Dwelling Plan Check 306.84497 days
## 46 HVAC 1 or 2 Family Dwelling Plan Check 0.00000 days
## 47 Nonbldg-Addition 1 or 2 Family Dwelling Plan Check 178.37500 days
## 48 Nonbldg-Alter/Repair 1 or 2 Family Dwelling Plan Check 237.44776 days
## 49 Nonbldg-New 1 or 2 Family Dwelling Plan Check 264.37220 days
## 50 Plumbing 1 or 2 Family Dwelling Plan Check 401.96552 days
## 51 Swimming-Pool/Spa 1 or 2 Family Dwelling Plan Check 245.53598 days
## 52 Bldg-Addition Apartment Plan Check 352.25581 days
## 53 Bldg-Alter/Repair Apartment Plan Check 225.92746 days
## 54 Bldg-Demolition Apartment Plan Check 292.38636 days
## 55 Bldg-New Apartment Plan Check 395.55556 days
## 56 Electrical Apartment Plan Check 221.45545 days
## 57 Elevator Apartment Plan Check 244.46377 days
## 58 Fire Sprinkler Apartment Plan Check 319.87179 days
## 59 Grading Apartment Plan Check 297.53061 days
## 60 HVAC Apartment Plan Check 374.89655 days
## 61 Nonbldg-Alter/Repair Apartment Plan Check 220.87500 days
## 62 Nonbldg-New Apartment Plan Check 394.99029 days
## 63 Plumbing Apartment Plan Check 288.81166 days
## 64 Swimming-Pool/Spa Apartment Plan Check 244.64706 days
## 65 Bldg-Addition Commercial Plan Check 307.53659 days
## 66 Bldg-Alter/Repair Commercial Plan Check 213.08066 days
## 67 Bldg-Demolition Commercial Plan Check 226.55319 days
## 68 Bldg-New Commercial Plan Check 279.96923 days
## 69 Electrical Commercial Plan Check 183.56192 days
## 70 Elevator Commercial Plan Check 217.42466 days
## 71 Fire Sprinkler Commercial Plan Check 235.33618 days
## 72 Grading Commercial Plan Check 353.51818 days
## 73 HVAC Commercial Plan Check 200.52609 days
## 74 Nonbldg-Addition Commercial Plan Check 0.00000 days
## 75 Nonbldg-Alter/Repair Commercial Plan Check 165.46364 days
## 76 Nonbldg-Demolition Commercial Plan Check 282.16667 days
## 77 Nonbldg-New Commercial Plan Check 290.52083 days
## 78 Plumbing Commercial Plan Check 221.61246 days
## 79 Swimming-Pool/Spa Commercial Plan Check 235.37500 days
## 80 Sign Offsite Plan Check 282.00000 days
## 81 Sign Onsite Plan Check 139.10940 days
## 82 Electrical Public Safety Only Plan Check 303.80000 days
## 83 Electrical Special Equipment Plan Check 0.00000 days
data3 <- data2[,c(1:3,6,11:13,16,22,55)]
data3 <- data3%>%filter(assessor_parcel != "***") # remove "***"
data3$lot <- str_match(data3$lot,'[0-9]') # match value
data3$diff_dates <- as.numeric(data3$diff_dates)
data3$assessor_page <- as.numeric(data3$assessor_page)
data3$assessor_parcel <- as.numeric(data3$assessor_parcel)
data3$lot <- as.numeric(data3$lot)
data3 <- na.omit(data3)
set.seed(1) # set seed
n <- sample(nrow(data3),nrow(data3)*0.7)
train <- data3[n,] # train set
test <- data3[-n,] # test set
rf <- randomForest(diff_dates ~., data = train) #random forest model
p.rf <- predict(rf,test) # predict value
rmse.rf <- sqrt(mean((p.rf - test$diff_dates)^2));rmse.rf # calculate rmse
## [1] 292.7875
train.x <- data.matrix(train[,-10])
train.y <- data.matrix(train$diff_dates)
test.x <- data.matrix(test[,-10])
lm.lasso <- cv.glmnet(train.x,train.y,alpha=1,family = 'gaussian') # modeling
plot(lm.lasso)
p.reg <- predict(lm.lasso,test.x,s=lm.lasso$lambda.min) # predict value, lambda chooses the value with the smallest error
rmse.las.reg <- sqrt(mean((p.reg - test$diff_dates)^2));rmse.las.reg # # calculate rmse
## [1] 293.7309
m3 <- lm(diff_dates ~., data = train)
sr <- step(m3,trace = 0)
p.sr <- predict(sr,test) # predict value
rmse.sr <- sqrt(mean((p.sr - test$diff_dates)^2));rmse.sr# calculate rmse
## [1] 292.4311
cp <- data.frame(model_name = c('random forest model',
'lasso regression',
'stepwise regression'),
RMSE = c(rmse.rf,rmse.las.reg,rmse.sr))
knitr::kable(cp)
| model_name | RMSE |
|---|---|
| random forest model | 292.7875 |
| lasso regression | 293.7309 |
| stepwise regression | 292.4311 |
The difference between the RMSE results obtained by these three methods is not very large, and if we have to choose one, we can see that the stepwise regression results are relatively better.