#Revision
# Show the current working directory; the relative file names used further
# down (e.g. "Boston.csv") are resolved against it.
getwd()
## [1] "C:/Users/dell/Desktop/Teaching"
# Empty the workspace, then trigger a garbage collection and print its report.
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 360279 9.7 592000 15.9 460000 12.3
## Vcells 367550 2.9 1023718 7.9 752067 5.8
# Record the R version, platform, locale and loaded packages for this run.
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: i386-w64-mingw32/i386 (32-bit)
## Running under: Windows 7 (build 7601) Service Pack 1
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] magrittr_1.5 tools_3.3.1 htmltools_0.3.5 Rcpp_0.12.7
## [5] stringi_1.1.1 rmarkdown_1.0 knitr_1.14 stringr_1.1.0
## [9] digest_0.6.10 evaluate_0.9
# Memory ceiling and current memory use as reported by R (presumably in Mb --
# see ?memory.size to confirm the unit).
# NOTE(review): the recorded session is Windows and these two helpers look
# Windows-specific -- confirm they are available on the platform / R version
# this script is run on.
memory.limit()
## [1] 1535
memory.size()
## [1] 17.35
# List every entry in the working directory.
dir()
## [1] "adult.data.txt" "advanced graphs.R"
## [3] "advanced_graphs.R" "advanced_graphs.spin.R"
## [5] "advanced_graphs.spin.Rmd" "AUS_adm2.RData"
## [7] "basic stats.R" "basic_stats.html"
## [9] "basickmeans.R" "Boston.csv"
## [11] "CAN_adm3.RData" "ccFraud.csv"
## [13] "data manipulation.R" "data_manipulation.html"
## [15] "dataan1.R" "datainput3.html"
## [17] "datainput3.R" "datainput3.spin.R"
## [19] "datainput3.spin.Rmd" "datainputcompared.html"
## [21] "datainputcompared.R" "datamining.R"
## [23] "dplyr.html" "dplyr.R"
## [25] "DVD.csv" "dviz1.R"
## [27] "FRA_adm3.RData" "IND_adm3.RData"
## [29] "json1.R" "menarche logistic.R"
## [31] "modules" "out2.csv"
## [33] "piping.html" "piping.R"
## [35] "plots again.R" "plots_again.html"
## [37] "prof3.html" "prof3.R"
## [39] "RegressioninR.html" "RegressioninR.R"
## [41] "revision in R.R" "revision_in_R.html"
## [43] "revision_in_R.R" "revision_in_R.spin.R"
## [45] "revision_in_R.spin.Rmd" "rsconnect"
## [47] "rulesassociation.html" "rulesassociation.R"
## [49] "screenshots" "SpatialR.R"
## [51] "Teaching.Rproj" "test1.html"
## [53] "test1.R" "textmining.html"
## [55] "textmining.R" "timeseries4.R"
## [57] "timeseriesinR.html" "timeseriesinR.R"
## [59] "ts3.html" "ts3.R"
## [61] "ts5.R"
# List only the CSV files in the working directory. `pattern` is a regular
# expression, so the dot is escaped and the match is anchored to the end of
# the file name; a bare ".csv" would also match names such as
# "mycsv_notes.txt" (any character followed by "csv", anywhere in the name).
dir(pattern = "\\.csv$")
## [1] "Boston.csv" "ccFraud.csv" "DVD.csv" "out2.csv"
library(data.table)
# Read Boston.csv from the working directory into a data.table.
boston <- fread("Boston.csv")
# Overview of the data.tables currently held in memory.
tables()
## NAME NROW NCOL MB
## [1,] boston 506 15 1
## COLS
## [1,] V1,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
## KEY
## [1,]
## Total: 1MB
# Column-by-column structure (type and first few values) of the table.
str(boston)
## Classes 'data.table' and 'data.frame': 506 obs. of 15 variables:
## $ V1 : chr "1" "2" "3" "4" ...
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : int 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 NA NA NA NA NA NA NA NA NA ...
## - attr(*, ".internal.selfref")=<externalptr>
# Shape of the freshly loaded table: rows x columns, then each separately.
dim(boston)
## [1] 506 15
nrow(boston)
## [1] 506
ncol(boston)
## [1] 15
# Column names, in positional order (useful when selecting columns by index).
names(boston)
## [1] "V1" "crim" "zn" "indus" "chas" "nox" "rm"
## [8] "age" "dis" "rad" "tax" "ptratio" "black" "lstat"
## [15] "medv"
# Per-column summary statistics; the transcript below reports 39 NA's in medv.
summary(boston)
## V1 crim zn indus
## Length:506 Min. : 0.00632 Min. : 0.00 Min. : 0.46
## Class :character 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19
## Mode :character Median : 0.25651 Median : 0.00 Median : 9.69
## Mean : 3.61352 Mean : 11.36 Mean :11.14
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10
## Max. :88.97620 Max. :100.00 Max. :27.74
##
## chas nox rm age
## Min. :0.00000 Min. :0.3850 Min. :3.561 Min. : 2.90
## 1st Qu.:0.00000 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02
## Median :0.00000 Median :0.5380 Median :6.208 Median : 77.50
## Mean :0.06917 Mean :0.5547 Mean :6.285 Mean : 68.57
## 3rd Qu.:0.00000 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08
## Max. :1.00000 Max. :0.8710 Max. :8.780 Max. :100.00
##
## dis rad tax ptratio
## Min. : 1.130 Min. : 1.000 Min. :187.0 Min. :12.60
## 1st Qu.: 2.100 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40
## Median : 3.207 Median : 5.000 Median :330.0 Median :19.05
## Mean : 3.795 Mean : 9.549 Mean :408.2 Mean :18.46
## 3rd Qu.: 5.188 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20
## Max. :12.127 Max. :24.000 Max. :711.0 Max. :22.00
##
## black lstat medv
## Min. : 0.32 Min. : 1.73 Min. : 5.00
## 1st Qu.:375.38 1st Qu.: 6.95 1st Qu.:17.20
## Median :391.44 Median :11.36 Median :21.50
## Mean :356.67 Mean :12.65 Mean :22.75
## 3rd Qu.:396.23 3rd Qu.:16.95 3rd Qu.:25.15
## Max. :396.90 Max. :37.97 Max. :50.00
## NA's :39
# Keep only complete rows (the summary above reported NA's in medv).
boston <- na.omit(boston)
# Frequency of each value of chas among the remaining rows.
table(boston$chas)
##
## 0 1
## 432 35
# Correlation of medv with rm; inside with() the name `rm` refers to the
# column of that name.
with(boston, cor(medv, rm))
## [1] 0.6919567
# Correlation of medv with chas.
with(boston, cor(medv, chas))
## [1] 0.1728877
# data.table form DT[i, j, by]: filter rows, compute, group.
# Mean medv per chas value, for rows with rm above 3.
boston[rm > 3, mean(medv), by = chas]
## chas V1
## 1: 0 22.28588
## 2: 1 28.44000
# Mean age and mean medv per chas value, for rows with rm above 4.
boston[rm > 4, .(mean(age), mean(medv)), by = chas]
## chas V1 V2
## 1: 0 67.21953 22.27186
## 2: 1 77.50000 28.44000
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep just three columns.
boston2 <- select(boston, medv, rm, chas)
# Draw a random 10% sample of the rows (differs between runs).
boston3 <- sample_frac(boston, size = 0.1)
#always nice to use comments
# Histogram of medv.
hist(boston$medv)

# Box plots of medv, one box per value of chas.
boxplot(boston$medv ~ boston$chas)

# Line plot of medv in row order, with a title.
# The title argument must be lower-case `main`: R argument names are
# case-sensitive, so the former `Main=` was passed on as an unknown graphical
# parameter, which produced a run of '"Main" is not a graphical parameter'
# warnings and left the plot without a title.
plot(boston$medv, type = "l", main = "Boston!")
library(ggplot2)

# Quick ggplot2 histogram of medv. Use ?name or ??keyword to look up help.
qplot(boston$medv)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## When stuck, search Google, Stack Overflow and the package vignettes.
# Linear regression of medv on crim, rm and nox.
a <- lm(medv ~ crim + rm + nox, data = boston)
# Printing the fit shows the call and the estimated coefficients.
a
##
## Call:
## lm(formula = medv ~ crim + rm + nox, data = boston)
##
## Coefficients:
## (Intercept) crim rm nox
## -17.8180 -0.2067 7.7439 -13.3363
# Full report: residual quantiles, coefficient table, R-squared, F-statistic.
summary(a)
##
## Call:
## lm(formula = medv ~ crim + rm + nox, data = boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.980 -3.260 -0.924 2.173 38.758
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.81804 3.37839 -5.274 2.05e-07 ***
## crim -0.20667 0.03579 -5.775 1.41e-08 ***
## rm 7.74390 0.42368 18.278 < 2e-16 ***
## nox -13.33634 2.72049 -4.902 1.31e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.212 on 463 degrees of freedom
## Multiple R-squared: 0.5645, Adjusted R-squared: 0.5616
## F-statistic: 200 on 3 and 463 DF, p-value: < 2.2e-16
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Variance inflation factor for each predictor of the fitted model `a`.
vif(a)
## crim rm nox
## 1.226828 1.121819 1.283600
# Outlier test on the fitted model; the transcript below lists studentized
# residuals with unadjusted and Bonferroni p-values.
outlierTest(a)
## rstudent unadjusted p-value Bonferonni p
## 330 6.547274 1.5624e-10 7.2966e-08
## 334 5.477616 7.0938e-08 3.3128e-05
## 333 4.959858 9.9293e-07 4.6370e-04
## 327 4.737641 2.8822e-06 1.3460e-03
## 331 4.212256 3.0409e-05 1.4201e-02
# Confirm the table shape before picking columns by position.
dim(boston)
## [1] 467 15
# Cluster on columns 10 to 15 (rad, tax, ptratio, black, lstat, medv), k = 4.
# On a data.table a single unnamed index selects ROWS, so the former
# `boston[10:15]` handed kmeans() only six observations with all 15 columns.
# The column subset has to go in the `j` position; `with = FALSE` makes the
# numeric `j` be read as column positions regardless of data.table version.
clus <- kmeans(boston[, 10:15, with = FALSE], centers = 4)
# ?kmeans
# Print the fitted clustering (sizes, centres, assignments, sums of squares).
# NOTE: kmeans() starts from randomly chosen centres, so the result can vary
# between runs unless set.seed() is called beforehand.
clus
# Save the cleaned table to out2.csv in the working directory.
# NOTE(review): write.csv() writes row names by default, which adds an extra
# leading column to the file -- consider row.names = FALSE after confirming
# what readers of out2.csv expect.
write.csv(boston, file = "out2.csv")
# Further topics to explore: rattle, ts (time series), association analysis.
# References: http://rpubs.com/ajaydecis and www.statmethods.net