#Revision

# Session housekeeping: report where the script runs and on what.
# The "## " lines are output captured on the original teaching machine.

# Working directory; every relative file name below is resolved against it.
getwd()
## [1] "C:/Users/dell/Desktop/Teaching"
# NOTE: the former rm(list = ls()) call was dropped on purpose. It deletes the
# caller's global objects when the script is sourced, yet leaves attached
# packages and options untouched, so it never gave a truly clean session.
# Restart R instead when a fresh session is needed.

# Trigger a garbage collection and print the memory usage report.
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 294033  7.9     592000 15.9   362220  9.7
## Vcells 324528  2.5     786432  6.0   677529  5.2

# R version, platform, locale and loaded packages.
sessionInfo()
## R version 3.2.2 (2015-08-14)
## Platform: i386-w64-mingw32/i386 (32-bit)
## Running under: Windows 7 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
## [1] magrittr_1.5  tools_3.2.2   htmltools_0.3 stringi_1.0-1 rmarkdown_0.7
## [6] knitr_1.10.5  stringr_1.0.0 digest_0.6.8  evaluate_0.7

# NOTE(review): memory.limit() and memory.size() are Windows-only helpers; on
# other platforms, and reportedly on R >= 4.2.0, they only return Inf with a
# warning. The values below were recorded on 32-bit Windows.
memory.limit()
## [1] 1535
memory.size()
## [1] 14.39

# List everything in the working directory.
dir()
##  [1] "adult.data.txt"           "advanced graphs.R"       
##  [3] "advanced_graphs.R"        "advanced_graphs.spin.R"  
##  [5] "advanced_graphs.spin.Rmd" "AUS_adm2.RData"          
##  [7] "basic stats.R"            "basic_stats.html"        
##  [9] "basickmeans.R"            "Boston.csv"              
## [11] "CAN_adm3.RData"           "ccFraud.csv"             
## [13] "data manipulation.R"      "data_manipulation.html"  
## [15] "dataan1.R"                "datainput3.html"         
## [17] "datainput3.R"             "datainput3.spin.R"       
## [19] "datainput3.spin.Rmd"      "datainputcompared.html"  
## [21] "datainputcompared.R"      "datamining.R"            
## [23] "dplyr.html"               "dplyr.R"                 
## [25] "DVD.csv"                  "FRA_adm3.RData"          
## [27] "IND_adm3.RData"           "modules"                 
## [29] "out2.csv"                 "piping.html"             
## [31] "piping.R"                 "plots again.R"           
## [33] "plots_again.html"         "prof3.html"              
## [35] "prof3.R"                  "RegressioninR.html"      
## [37] "RegressioninR.R"          "revision in R.R"         
## [39] "revision_in_R.R"          "revision_in_R.spin.R"    
## [41] "revision_in_R.spin.Rmd"   "rsconnect"               
## [43] "rulesassociation.html"    "rulesassociation.R"      
## [45] "SpatialR.R"               "Teaching.Rproj"          
## [47] "test1.html"               "test1.R"                 
## [49] "textmining.html"          "textmining.R"            
## [51] "timeseriesinR.html"       "timeseriesinR.R"         
## [53] "ts3.html"                 "ts3.R"
# `pattern` is a regular expression, not a shell glob: escape the dot and
# anchor at the end so only names that really end in ".csv" are listed
# (the unescaped ".csv" would also match e.g. "xcsv.txt" or "a.csv.bak").
dir(pattern = "\\.csv$")
## [1] "Boston.csv"  "ccFraud.csv" "DVD.csv"     "out2.csv"
library(data.table)
# Read Boston.csv from the working directory into a data.table.
boston <- fread("Boston.csv")

# Overview of the data.tables currently held in memory.
tables()
##      NAME   NROW NCOL MB
## [1,] boston  506   15  1
##      COLS                                                                 
## [1,] V1,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
##      KEY
## [1,]    
## Total: 1MB
# Compact structure of the table: one line per column with its type and the
# first few values.
# NOTE(review): V1 comes in as character and looks like a row index carried
# over from an earlier CSV export -- confirm against Boston.csv.
# medv already shows NA values here; they are removed further down with
# na.omit().
str(boston)
## Classes 'data.table' and 'data.frame':   506 obs. of  15 variables:
##  $ V1     : chr  "1" "2" "3" "4" ...
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : int  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 NA NA NA NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Size of the table: rows and columns together, then each on its own.
dim(boston)
## [1] 506  15
nrow(boston)
## [1] 506
ncol(boston)
## [1] 15
# Column names, in column order.
names(boston)
##  [1] "V1"      "crim"    "zn"      "indus"   "chas"    "nox"     "rm"     
##  [8] "age"     "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"  
## [15] "medv"
# Per-column summary statistics; the NA count for medv is reported at the
# bottom of its column.
summary(boston)
##       V1                 crim                zn             indus      
##  Length:506         Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46  
##  Class :character   1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19  
##  Mode  :character   Median : 0.25651   Median :  0.00   Median : 9.69  
##                     Mean   : 3.61352   Mean   : 11.36   Mean   :11.14  
##                     3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10  
##                     Max.   :88.97620   Max.   :100.00   Max.   :27.74  
##                                                                        
##       chas              nox               rm             age        
##  Min.   :0.00000   Min.   :0.3850   Min.   :3.561   Min.   :  2.90  
##  1st Qu.:0.00000   1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02  
##  Median :0.00000   Median :0.5380   Median :6.208   Median : 77.50  
##  Mean   :0.06917   Mean   :0.5547   Mean   :6.285   Mean   : 68.57  
##  3rd Qu.:0.00000   3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08  
##  Max.   :1.00000   Max.   :0.8710   Max.   :8.780   Max.   :100.00  
##                                                                     
##       dis              rad              tax           ptratio     
##  Min.   : 1.130   Min.   : 1.000   Min.   :187.0   Min.   :12.60  
##  1st Qu.: 2.100   1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40  
##  Median : 3.207   Median : 5.000   Median :330.0   Median :19.05  
##  Mean   : 3.795   Mean   : 9.549   Mean   :408.2   Mean   :18.46  
##  3rd Qu.: 5.188   3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20  
##  Max.   :12.127   Max.   :24.000   Max.   :711.0   Max.   :22.00  
##                                                                   
##      black            lstat            medv      
##  Min.   :  0.32   Min.   : 1.73   Min.   : 5.00  
##  1st Qu.:375.38   1st Qu.: 6.95   1st Qu.:17.20  
##  Median :391.44   Median :11.36   Median :21.50  
##  Mean   :356.67   Mean   :12.65   Mean   :22.75  
##  3rd Qu.:396.23   3rd Qu.:16.95   3rd Qu.:25.15  
##  Max.   :396.90   Max.   :37.97   Max.   :50.00  
##                                   NA's   :39
# Keep only complete rows (the summary above reports 39 NA's in medv).
boston <- na.omit(boston)

# Frequency of each chas value.
table(boston[["chas"]])
## 
##   0   1 
## 432  35

# Correlation of medv with rm, then with chas.
with(boston, cor(medv, rm))
## [1] 0.6919567
with(boston, cor(medv, chas))
## [1] 0.1728877

# data.table syntax DT[i, j, by]: filter rows in i, compute in j, per group
# in by. Unnamed results are labelled V1, V2, ...
boston[rm > 3, mean(medv), by = chas]
##    chas       V1
## 1:    0 22.28588
## 2:    1 28.44000
boston[rm > 4, list(mean(age), mean(medv)), by = chas]
##    chas       V1       V2
## 1:    0 67.21953 22.27186
## 2:    1 77.50000 28.44000
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:data.table':
## 
##     between, last
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Column subset: keep only medv, rm and chas.
boston2 <- dplyr::select(boston, medv, rm, chas)
# Random 10% of the rows; no seed is set, so the sample differs between runs.
boston3 <- dplyr::sample_frac(boston, size = 0.1)
# always nice to use comments

# Base-graphics views of medv: distribution, split by chas, and in row order.
hist(boston$medv)

boxplot(boston$medv~boston$chas)

# Argument names are case-sensitive: the title argument is `main`. The
# original `Main=` was not recognised, so the plot had no title and every
# low-level plotting call warned '"Main" is not a graphical parameter'.
plot(boston$medv, type = "l", main = "Boston!")
library(ggplot2)

# Quick ggplot2 histogram of the same column.
# NOTE(review): qplot() is flagged as deprecated in recent ggplot2 releases;
# consider ggplot() + geom_histogram() when the course material is updated.
qplot(boston$medv)  # use ?topic or ??keyword to look things up in the help
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# When stuck, search Google, Stack Overflow and the package vignettes

# Linear regression of medv on crim, rm and nox, fitted on the NA-free data.
a <- lm(formula = medv ~ crim + rm + nox, data = boston)

# Printing the fit shows the call and the estimated coefficients.
a
## 
## Call:
## lm(formula = medv ~ crim + rm + nox, data = boston)
## 
## Coefficients:
## (Intercept)         crim           rm          nox  
##    -17.8180      -0.2067       7.7439     -13.3363

# Full report: residual quantiles, coefficient table, R-squared, F-statistic.
summary(a)
## 
## Call:
## lm(formula = medv ~ crim + rm + nox, data = boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.980  -3.260  -0.924   2.173  38.758 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.81804    3.37839  -5.274 2.05e-07 ***
## crim         -0.20667    0.03579  -5.775 1.41e-08 ***
## rm            7.74390    0.42368  18.278  < 2e-16 ***
## nox         -13.33634    2.72049  -4.902 1.31e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.212 on 463 degrees of freedom
## Multiple R-squared:  0.5645, Adjusted R-squared:  0.5616 
## F-statistic:   200 on 3 and 463 DF,  p-value: < 2.2e-16
# Regression diagnostics from the car package.
library(car)

# Variance inflation factor of each predictor in the fitted model.
vif(mod = a)
##     crim       rm      nox 
## 1.226828 1.121819 1.283600

# Studentized residuals with unadjusted and Bonferroni-adjusted p-values for
# the most extreme observations.
outlierTest(model = a)
##     rstudent unadjusted p-value Bonferonni p
## 330 6.547274         1.5624e-10   7.2966e-08
## 334 5.477616         7.0938e-08   3.3128e-05
## 333 4.959858         9.9293e-07   4.6370e-04
## 327 4.737641         2.8822e-06   1.3460e-03
## 331 4.212256         3.0409e-05   1.4201e-02
# Size of the table after dropping incomplete rows.
dim(boston)
## [1] 467  15

# k-means with 4 clusters on columns 10 to 15 (rad, tax, ptratio, black,
# lstat, medv).
# boston is a data.table, so boston[10:15] selects ROWS 10-15, not columns:
# the original call clustered only six observations, across all 15 columns.
# The columns are therefore selected explicitly in j.
# NOTE(review): columns 10-15 are assumed to be the intended features, as
# boston[10:15] would give on a plain data.frame -- confirm.
# kmeans() starts from randomly chosen centres; fixing the seed makes the
# cluster assignment repeatable between runs.
set.seed(42)
clus <- kmeans(boston[, .(rad, tax, ptratio, black, lstat, medv)], centers = 4)
#?kmeans
clus
# The output recorded here previously came from the six-row subset and no
# longer applies; re-run the script to regenerate it.
# Export the NA-free table to out2.csv in the working directory (overwrites
# any existing file of that name).
# row.names = FALSE keeps write.csv() from adding an unnamed index column,
# which would come back as an extra "V1" column on the next fread().
write.csv(boston, file = "out2.csv", row.names = FALSE)


# rattle
# ts (time series)

# association analysis
# Refer to http://rpubs.com/ajaydecis
# Refer to http://www.statmethods.net