data(iris)
class(iris)
## [1] "data.frame"
class(iris)
## [1] "data.frame"
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
head(iris, 10)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
tail(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
tail(iris, 10)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 141 6.7 3.1 5.6 2.4 virginica
## 142 6.9 3.1 5.1 2.3 virginica
## 143 5.8 2.7 5.1 1.9 virginica
## 144 6.8 3.2 5.9 2.3 virginica
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica
iris[1:3,]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
iris[1:3,1]
## [1] 5.1 4.9 4.7
iris[1:3,"Sepal.Length"]
## [1] 5.1 4.9 4.7
head(iris[,1:2])
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
## 4 4.6 3.1
## 5 5.0 3.6
## 6 5.4 3.9
head(iris$"Sepal.Length")
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
five.Sepal.iris <- iris[1:5, c("Sepal.Length", "Sepal.Width")]
head(five.Sepal.iris)
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
## 4 4.6 3.1
## 5 5.0 3.6
setosa.data <- iris[iris$Species=="setosa",1:5]
head(setosa.data)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
head(sort(iris$Sepal.Length, decreasing = TRUE))
## [1] 7.9 7.7 7.7 7.7 7.7 7.6
head(iris[order(iris$Sepal.Length, decreasing = TRUE),])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 132 7.9 3.8 6.4 2.0 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 106 7.6 3.0 6.6 2.1 virginica
#Pie Chart
table.iris = table(iris$Species)
pie(table.iris)
#Histogram
hist(iris$Sepal.Length)
#Box Plot
boxplot(Petal.Width ~ Species, data = iris)
#Scatter Plot
plot(x=iris$Petal.Length, y=iris$Petal.Width, col=iris$Species)
#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/lvr_prices.csv', 'lvr_prices.csv')
getwd()
## [1] "D:/OS DATA/Desktop"
library(readr)
lvr_prices <- read_csv("lvr_prices.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_character(),
## X1 = col_integer(),
## land_sqmeter = col_double(),
## trading_ymd = col_date(format = ""),
## finish_ymd = col_date(format = ""),
## building_sqmeter = col_double(),
## room = col_integer(),
## living_room = col_integer(),
## bath = col_integer(),
## total_price = col_integer(),
## price_per_sqmeter = col_double(),
## parking_sqmeter = col_double(),
## parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 1282 total_price an integer 6700000000 'lvr_prices.csv' file 2 2243 total_price an integer 3882685600 'lvr_prices.csv' row 3 2244 total_price an integer 3373314400 'lvr_prices.csv' col 4 4629 total_price an integer 3050000000 'lvr_prices.csv' expected 5 5890 total_price an integer 3133800000 'lvr_prices.csv'
## ... ................. ... .......................................................... ........ .......................................................... ...... .......................................................... .... .......................................................... ... .......................................................... ... .......................................................... ........ ..........................................................
## See problems(...) for more details.
#View(lvr_prices)
daan <- lvr_prices[lvr_prices$area == '大安區',]
sum(as.numeric(daan$total_price), na.rm = TRUE)
## [1] 2.79477e+11
mean(as.numeric(daan$total_price), na.rm = TRUE)
## [1] 29798170
zhongshan <- lvr_prices[lvr_prices$area == '中山區',c('address', 'total_price')]
idx <- order(zhongshan$total_price, decreasing = TRUE)
res <- zhongshan[idx,]
res[1:3,]
## # A tibble: 3 x 2
## address total_price
## <chr> <int>
## 1 臺北市中山區建國北路一段138巷1~30號 1850000000
## 2 臺北市中山區南京東路三段1~30號 1400000000
## 3 中山段二小段31~60地號 1084948034
getTopThree <- function(area){
zhongshan <- lvr_prices[lvr_prices$area == area,]
idx <- order(zhongshan$total_price, decreasing = TRUE)
res <- zhongshan[idx,c('area', 'address', 'total_price')]
return(res[1:3,])
}
getTopThree('大安區')
## # A tibble: 3 x 3
## area address total_price
## <chr> <chr> <int>
## 1 大安區 臺北市大安區羅斯福路三段283巷4弄1~30號 1869781219
## 2 大安區 臺北市大安區忠孝東路四段241~270號 971340000
## 3 大安區 學府段三小段31~60地號 966660000
tapply(lvr_prices$total_price, lvr_prices$area, function(e)mean(e,na.rm=TRUE))
## 士林區 大同區 大安區 中山區 中正區 內湖區 文山區 北投區
## 24139903 18063872 29798170 26708805 30154011 27905514 16953869 20626410
## 松山區 信義區 南港區 萬華區
## 25652125 24725051 25235793 13642289
price_per_sec <- tapply(lvr_prices$total_price, lvr_prices$area, function(e)mean(e,na.rm=TRUE))
barplot(sort(price_per_sec, decreasing = TRUE), main= "各區平均價", xlab = "區域", ylab = "價格", col="blue")
boxplot(log(total_price) ~ area, data = lvr_prices, main= "房價箱型圖", xlab = "區域", ylab = "價格(log)")
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 2 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 3 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 4 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 6 is not drawn
## 使用 dplyr 分析資料
# install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#help(package='dplyr')
filter(lvr_prices, area == '中山區') %>% head()
## # A tibble: 6 x 29
## X1 area trading_target address
## <int> <chr> <chr> <chr>
## 1 13 中山區 房地(土地+建物) 臺北市中山區合江街31~60號
## 2 14 中山區 房地(土地+建物) 臺北市中山區中山北路二段183巷1~30號
## 3 16 中山區 房地(土地+建物) 臺北市中山區吉林路361~390號
## 4 17 中山區 土地 長安段三小段271~300地號
## 5 24 中山區 房地(土地+建物) 臺北市中山區林森北路485巷1~30號
## 6 39 中山區 房地(土地+建物) 臺北市中山區建國北路三段93巷5弄1~30號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## # non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## # trading_num <chr>, floor <chr>, total_floor <chr>,
## # building_type <chr>, main_purpose <chr>, built_with <chr>,
## # finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## # living_room <int>, bath <int>, compartment <chr>, management <chr>,
## # total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## # parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## # numbers <chr>
select(lvr_prices, total_price) %>% head()
## # A tibble: 6 x 1
## total_price
## <int>
## 1 18680000
## 2 20300000
## 3 132096
## 4 4200000
## 5 14000000
## 6 255000
lvr_prices %>%
select(area, total_price) %>%
filter(area == '中山區') %>%
head()
## # A tibble: 6 x 2
## area total_price
## <chr> <int>
## 1 中山區 5960000
## 2 中山區 20200000
## 3 中山區 4050000
## 4 中山區 1900000
## 5 中山區 14800000
## 6 中山區 10200000
lvr_prices %>%
select(area, total_price) %>%
filter(area == '中山區') %>%
arrange(total_price) %>%
head()
## # A tibble: 6 x 2
## area total_price
## <chr> <int>
## 1 中山區 0
## 2 中山區 0
## 3 中山區 10860
## 4 中山區 16000
## 5 中山區 18060
## 6 中山區 21244
lvr_prices %>%
select(area, total_price) %>%
filter(area == '中山區') %>%
arrange(desc(total_price)) %>%
head()
## # A tibble: 6 x 2
## area total_price
## <chr> <int>
## 1 中山區 1850000000
## 2 中山區 1400000000
## 3 中山區 1084948034
## 4 中山區 1011136500
## 5 中山區 952875000
## 6 中山區 903865500
lvr_prices$trading_ym <- as.Date(format(lvr_prices$trading_ymd, '%Y-%m-01'))
lvr_stat <- lvr_prices %>%
select(trading_ym, area, total_price) %>%
filter(trading_ym >= '2012-01-01') %>%
group_by(trading_ym, area) %>%
summarise(overall_price = sum(as.numeric(total_price), na.rm=TRUE))
lvr_stat$area <- as.factor(lvr_stat$area)
par(mfrow=c(3,4))
for (a in levels(lvr_stat$area)){
plot(overall_price ~ trading_ym
,lvr_stat[lvr_stat$area == a,]
, type='l', main = a)
}
library(tidyr)
price_pivot <- spread(lvr_stat, trading_ym, overall_price, fill=0)
View(price_pivot)
write.csv(price_pivot, 'taipei_house_price.csv')
download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Training50.csv', 'Training50.csv')
library(rpart)
trainset <- read.csv('Training50.csv')
trainset$X <- NULL
trainset$Creditability = as.factor(trainset$Creditability)
head(trainset)
## Creditability Account.Balance Duration.of.Credit..month.
## 1 1 3 6
## 2 0 1 15
## 3 0 1 42
## 4 0 3 36
## 5 1 3 24
## 6 1 1 15
## Payment.Status.of.Previous.Credit Purpose Credit.Amount
## 1 2 3 2108
## 2 1 4 950
## 3 2 3 7174
## 4 3 4 7980
## 5 3 2 2028
## 6 2 4 2511
## Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
## 1 1 3 2
## 2 1 4 4
## 3 4 3 4
## 4 4 1 4
## 5 1 3 2
## 6 1 1 1
## Sex...Marital.Status Guarantors Duration.in.Current.address
## 1 3 1 2
## 2 2 1 3
## 3 1 1 3
## 4 2 1 4
## 5 2 1 2
## 6 1 1 4
## Most.valuable.available.asset Age..years. Concurrent.Credits
## 1 1 29 2
## 2 3 33 2
## 3 3 30 2
## 4 3 27 2
## 5 2 30 2
## 6 3 23 2
## Type.of.apartment No.of.Credits.at.this.Bank Occupation No.of.dependents
## 1 1 1 1 1
## 2 1 2 1 2
## 3 2 1 1 1
## 4 1 2 1 1
## 5 2 2 1 1
## 6 1 1 1 1
## Telephone Foreign.Worker
## 1 1 1
## 2 1 1
## 3 2 1
## 4 2 1
## 5 1 1
## 6 1 1
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
model <- rpart(Creditability ~ ., data=trainset, method = 'class')
#summary(model)
plot(model, margin = 0.1)
text(model, pretty=0,cex=0.6)
download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Test50.csv', 'Test50.csv')
testset <- read.csv('Test50.csv')
testset$X <- NULL
testset$Creditability = as.factor(testset$Creditability)
predicted <- predict(model, testset, type = "class")
table(predicted, testset$Creditability)
##
## predicted 0 1
## 0 64 52
## 1 93 291
#install.packages('caret')
#install.packages('e1071')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
prediction <- predict(model, testset, type = "prob")
roc_x <- c(0)
roc_y <- c(0)
for(i in seq(0,1,0.01)){
res <- as.factor(ifelse(prediction[,1] >= i, 0, 1))
tb <- table(testset$Creditability, res)
if (ncol(tb) == 2){
cm <- confusionMatrix(tb)
x <- 1 - cm$byClass[2]
y <- cm$byClass[1]
roc_x <- c(roc_x, x)
roc_y <- c(roc_y, y)
}
}
roc_x <- c(roc_x, 1)
roc_y <- c(roc_y, 1)
plot(roc_x, roc_y, type='b')
#install.packages('ROCR')
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions <- predict(model, testset, type="prob")
pred.to.roc <- predictions[, 2]
pred.rocr <- prediction(pred.to.roc, as.factor(testset$Creditability))
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))
## 使用隨機森林 (Random Forest)
library(randomForest)
forest <- randomForest(Creditability ~., data = trainset, ntree=200, importance=T, proximity=T)
forest.predicted <- predict(forest, testset, type = "class")
table(forest.predicted, testset$Creditability)
##
## forest.predicted 0 1
## 0 53 25
## 1 104 318
(316 + 53) / (316+53+27+104)
## [1] 0.738
predictions1 <- predict(model, testset, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$Creditability))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")
predictions2 <- predict(forest, testset, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$Creditability))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")
plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)
## 分群問題
#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/customers.csv', 'customers.csv')
customers <- read.csv('customers.csv')
head(customers)
## CustomerID Genre Age Annual_Income Spending_Score
## 1 1 Male 19 15 39
## 2 2 Male 21 15 81
## 3 3 Female 20 16 6
## 4 4 Female 23 16 77
## 5 5 Female 31 17 40
## 6 6 Female 22 17 76
customers <- customers[,c('Annual_Income', 'Spending_Score')]
set.seed(123)
kc <- kmeans(customers, centers = 5)
kc
## K-means clustering with 5 clusters of sizes 50, 27, 74, 39, 10
##
## Cluster means:
## Annual_Income Spending_Score
## 1 27.40000 49.48000
## 2 79.00000 16.59259
## 3 55.90541 49.93243
## 4 86.53846 82.12821
## 5 109.70000 22.00000
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
## [141] 2 4 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2
## [176] 4 2 4 2 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
##
## Within cluster sum of squares by cluster:
## [1] 48174.480 4062.519 7375.000 13444.051 2458.100
## (between_SS / total_SS = 72.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(customers$Annual_Income, customers$Spending_Score, col=kc$cluster)
points(kc$centers[,1], kc$centers[,2], col='yellow')
library(cluster)
kcs = silhouette(kc$cluster,dist(customers))
summary(kcs)
## Silhouette of 200 units in 5 clusters from silhouette.default(x = kc$cluster, dist = dist(customers)) :
## Cluster sizes and average silhouette widths:
## 50 27 74 39 10
## 0.04103987 0.47253618 0.63162169 0.48851396 0.35704613
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.4401 0.2089 0.5292 0.4209 0.6421 0.7556
plot(kcs)
library(fpc)
nk <- 2:10
set.seed(123)
SW <- sapply(nk, function(k) {
cluster.stats(dist(customers), kmeans(customers, centers=k)$cluster)$avg.silwidth
})
plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")