---
title: "gg2"
output: html_document
---

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v stringr 1.4.0
## v tidyr   1.1.2     v forcats 0.5.0
## v readr   1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(htmlwidgets)

1 корректируем лишние пробелы в обозначении студий

# Build the input path once and reuse it for both the echo and the read.
sFolder <- "C:/Users/AlexBer127/"
sInFile <- "Downloads/SPb_dwellings_for_rent_EMLS_sample_8.csv"
sInPath <- paste0(sFolder, sInFile)
sInPath
## [1] "C:/Users/AlexBer127/Downloads/SPb_dwellings_for_rent_EMLS_sample_8.csv"
# The file is read as semicolon-separated with a decimal comma.
mytable <- read.csv(sInPath, dec = ",", sep = ";")
view(mytable)
# Collapse the extra spaces in the studio label, then label empty cells.
mytable$Rooms[mytable$Rooms == "1   (studiya)"] <- "1 (studiya)"
mytable$Rooms[mytable$Rooms == ""] <- "no answer"

табличка с частотой разных значений

# Frequency table of the distinct Rooms values, shown with Russian headers.
rooms <- as.data.frame(table(mytable$Rooms))
names(rooms) <- c("количество комнат", "частота")
view(rooms)

2 заменим значения колонки агентов на 1 (да) и 0 (нет)

# Missing agent flags become "0". A blank cell is "" when the column is read
# as text but NA when read.csv() parses the column as numeric, so both are
# handled here (%in% matches NA, unlike ==).
mytable$No_agents[mytable$No_agents %in% c(NA, "")] <- "0"

табличка с частотой агентов

# Frequency table of the agent flag with readable labels.
agents <- as.data.frame(table(mytable$No_agents))
# Var1 is converted to character so the text labels below can be assigned.
agents$Var1 <- as.character(agents$Var1)
# Compare against strings explicitly instead of relying on number coercion.
# NOTE(review): the source column is called No_agents while the header below
# reads "via an intermediary" -- confirm that 1 really means "yes" here.
agents$Var1[agents$Var1 == "0"] <- "нет"
agents$Var1[agents$Var1 == "1"] <- "да"
names(agents)[names(agents) == "Var1"] <- "через посредника"
names(agents)[names(agents) == "Freq"] <- "частота"
view(agents)

3 работаем с минимальным сроком аренды

# Strip the literal " mes." suffix. fixed() stops the trailing dot from being
# treated as a regex wildcard that would match any character.
mytable$Minimum_duration <- str_replace_all(
  mytable$Minimum_duration, fixed(" mes."), ""
)
# as.numeric() is already vectorised; unlike sapply() over a character vector
# it does not attach the input strings as element names and always returns a
# numeric vector, even for empty input.
mytable$Minimum_duration <- as.numeric(mytable$Minimum_duration)
# Values that did not parse as a number are recorded as 0.
mytable$Minimum_duration[is.na(mytable$Minimum_duration)] <- 0

табличка частоты значений

# Frequency table of the cleaned minimum-duration values.
min_dur <- as.data.frame(table(mytable$Minimum_duration))
names(min_dur) <- c("месяцев", "частота")
view(min_dur)

4 замены в колонке “Лифт”: “Есть”=1, “Нет”=0

# Recode the lift flag in one call; the named replacements are applied in
# the order given ("Est'" first, then "Net").
mytable$Lift <- str_replace_all(mytable$Lift, c("Est'" = "1", "Net" = "0"))
# Empty cells get an explicit label.
mytable$Lift[mytable$Lift == ""] <- "no answer"

табличка частоты

# Frequency table of the recoded Lift values.
lift <- as.data.frame(table(mytable$Lift))
view(lift)

переименование столбцов: в view(lift) выше новые названия не видны, потому что view() вызывается до переименования

# Give the frequency table readable headers, then display it. view() shows
# the object as it is at call time, so it has to run after the renaming for
# the new column names to be visible.
names(lift)[names(lift) == "Var1"] <- "наличие лифта"
names(lift)[names(lift) == "Freq"] <- "частота"
view(lift)

5 чистим ошибки в колонке NFloor

# Replace every stretch matching the pattern below with "0".
# NOTE(review): the pattern looks intended to catch textual (non-numeric)
# entries -- confirm against the raw data.
mytable$NFloor <- str_replace_all(mytable$NFloor, ".{0,5}[a-z].[a-z].{1,7}", "0")
# Empty cells also count as 0.
mytable$NFloor[mytable$NFloor == ""] <- "0"
# as.numeric() is vectorised; sapply() over a character vector would attach
# the input strings as element names and is not type-stable on empty input.
mytable$NFloor <- as.numeric(mytable$NFloor)

исправляем некорректные данные цен

# Overwrite column 8 in five specific rows with corrected values; row numbers
# and values are paired by position.
# NOTE(review): column 8 is presumably the price column -- confirm, and
# consider addressing it by name.
mytable[c(2388, 88, 838, 1283, 2840), 8] <-
  c(20000, 16000, 18000, 19000, 30000)

6 чистим NA и пустые значения в остальных переменных

# Text columns: empty cells become the explicit label "no answer".
# (Furnished and Bath were previously handled twice; once is enough.)
for (col in c("Metro", "Building", "Furnished", "Bath", "Refurbished",
              "Balcony")) {
  mytable[[col]][mytable[[col]] == ""] <- "no answer"
}
# Numeric columns: missing values become 0.
for (col in c("Dist_metro_ad", "Area_total", "Area_living", "Area_kitchen",
              "Floor")) {
  mytable[[col]][is.na(mytable[[col]])] <- 0
}
# Missing construction years get the text label; if the column was numeric,
# assigning a string turns the whole column into character.
mytable$Year_construction[is.na(mytable$Year_construction)] <- "no answer"

округление площадей

# Round the kitchen and living areas to one decimal place.
area_cols <- c("Area_kitchen", "Area_living")
mytable[area_cols] <- lapply(mytable[area_cols], round, digits = 1)

исправляем разно-написанные одинаковые значения в колонке балконов

# Unify differently spelled but equivalent balcony values.
mytable$Balcony <- replace(mytable$Balcony, mytable$Balcony == "Net", "net")
mytable$Balcony <- replace(mytable$Balcony, mytable$Balcony == "est'", "Balkon")

исправляем пустые значения широты и долготы

# Missing coordinates are set to 0.
# NOTE(review): 0 is itself a valid coordinate value -- confirm that later
# steps treat it as "unknown".
mytable[c("Longitude", "Latitude")] <- lapply(
  mytable[c("Longitude", "Latitude")],
  function(v) replace(v, is.na(v), 0.0)
)

исправляем ошибочно заполненные поля “тип ванной” и “тип ремонта” (тип ванной был записан в поле ремонта)

# Some rows carry a bath type in the Refurbished column. Copy that value into
# Bath, then flag the Refurbished cell as wrong.
bath_types <- c(
  "Dush", "Na kuhne", "Otdel'naya", "Sidyachaya", "Sovmeshchennaya"
)
misplaced <- mytable$Refurbished %in% bath_types
mytable$Bath[misplaced] <- as.character(mytable$Refurbished[misplaced])
mytable$Refurbished[misplaced] <- "wrong data"

удаляем наблюдения, цена в которых не соответствует общей тенденции

# Drop the listed rows (by position); the order of the indices is irrelevant
# for negative indexing, so they are listed in ascending order.
mytable <- mytable[
  -c(
    193, 416, 536, 564, 711, 745, 1059, 1300, 1415,
    2076, 2154, 2178, 2359, 2459, 2529
  ),
]