#Revision

# Show the current working directory; the relative file names used below
# resolve against it.
getwd()
## [1] "C:/Users/dell/Desktop/Teaching"
# Deliberately no rm(list = ls()) here: it only removes objects from the
# global environment (attached packages and options() stay as they were) and
# it wipes the workspace of anyone who source()s this file. Restart R to get
# a genuinely clean session instead.
# Run a garbage collection and report current memory usage.
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 360279  9.7     592000 15.9   460000 12.3
## Vcells 367550  2.9    1023718  7.9   752067  5.8
# Print the R version, platform, locale and attached/loaded packages, so the
# environment that produced the results below is on record.
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: i386-w64-mingw32/i386 (32-bit)
## Running under: Windows 7 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] magrittr_1.5    tools_3.3.1     htmltools_0.3.5 Rcpp_0.12.7    
##  [5] stringi_1.1.1   rmarkdown_1.0   knitr_1.14      stringr_1.1.0  
##  [9] digest_0.6.10   evaluate_0.9
# Report the memory cap for this R session.
# NOTE(review): memory.limit() and memory.size() are Windows-only, and the
# R 4.2.0 release notes describe them as stubs that return Inf -- confirm
# before running this on a current R version.
memory.limit()
## [1] 1535
# Report how much memory R is currently using.
memory.size()
## [1] 17.35
# List the files and folders in the working directory.
dir()
##  [1] "adult.data.txt"           "advanced graphs.R"       
##  [3] "advanced_graphs.R"        "advanced_graphs.spin.R"  
##  [5] "advanced_graphs.spin.Rmd" "AUS_adm2.RData"          
##  [7] "basic stats.R"            "basic_stats.html"        
##  [9] "basickmeans.R"            "Boston.csv"              
## [11] "CAN_adm3.RData"           "ccFraud.csv"             
## [13] "data manipulation.R"      "data_manipulation.html"  
## [15] "dataan1.R"                "datainput3.html"         
## [17] "datainput3.R"             "datainput3.spin.R"       
## [19] "datainput3.spin.Rmd"      "datainputcompared.html"  
## [21] "datainputcompared.R"      "datamining.R"            
## [23] "dplyr.html"               "dplyr.R"                 
## [25] "DVD.csv"                  "dviz1.R"                 
## [27] "FRA_adm3.RData"           "IND_adm3.RData"          
## [29] "json1.R"                  "menarche logistic.R"     
## [31] "modules"                  "out2.csv"                
## [33] "piping.html"              "piping.R"                
## [35] "plots again.R"            "plots_again.html"        
## [37] "prof3.html"               "prof3.R"                 
## [39] "RegressioninR.html"       "RegressioninR.R"         
## [41] "revision in R.R"          "revision_in_R.html"      
## [43] "revision_in_R.R"          "revision_in_R.spin.R"    
## [45] "revision_in_R.spin.Rmd"   "rsconnect"               
## [47] "rulesassociation.html"    "rulesassociation.R"      
## [49] "screenshots"              "SpatialR.R"              
## [51] "Teaching.Rproj"           "test1.html"              
## [53] "test1.R"                  "textmining.html"         
## [55] "textmining.R"             "timeseries4.R"           
## [57] "timeseriesinR.html"       "timeseriesinR.R"         
## [59] "ts3.html"                 "ts3.R"                   
## [61] "ts5.R"
# List only the CSV files. `pattern` is a regular expression, not a glob, so
# the dot is escaped and the match is anchored to the end of the file name;
# a bare ".csv" would also match names such as "mycsv.R".
dir(pattern = "\\.csv$")
## [1] "Boston.csv"  "ccFraud.csv" "DVD.csv"     "out2.csv"
library(data.table)
# Read Boston.csv from the working directory into a data.table named `boston`.
boston <- fread("Boston.csv")

# Overview of every data.table in memory: name, rows, columns, size in MB,
# column names and key.
tables()
##      NAME   NROW NCOL MB
## [1,] boston  506   15  1
##      COLS                                                                 
## [1,] V1,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
##      KEY
## [1,]    
## Total: 1MB
# Compact structure listing: class, dimensions, and the type plus first few
# values of each column.
str(boston)
## Classes 'data.table' and 'data.frame':   506 obs. of  15 variables:
##  $ V1     : chr  "1" "2" "3" "4" ...
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : int  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 NA NA NA NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr>
# Number of rows and columns, as one vector.
dim(boston)
## [1] 506  15
# Number of rows only.
nrow(boston)
## [1] 506
# Number of columns only.
ncol(boston)
## [1] 15
# Column names, in positional order.
names(boston)
##  [1] "V1"      "crim"    "zn"      "indus"   "chas"    "nox"     "rm"     
##  [8] "age"     "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"  
## [15] "medv"
# Per-column summary (min, quartiles, mean, max); it also reports the number
# of NA values in a column, which is how the missing medv values show up.
summary(boston)
##       V1                 crim                zn             indus      
##  Length:506         Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46  
##  Class :character   1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19  
##  Mode  :character   Median : 0.25651   Median :  0.00   Median : 9.69  
##                     Mean   : 3.61352   Mean   : 11.36   Mean   :11.14  
##                     3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10  
##                     Max.   :88.97620   Max.   :100.00   Max.   :27.74  
##                                                                        
##       chas              nox               rm             age        
##  Min.   :0.00000   Min.   :0.3850   Min.   :3.561   Min.   :  2.90  
##  1st Qu.:0.00000   1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02  
##  Median :0.00000   Median :0.5380   Median :6.208   Median : 77.50  
##  Mean   :0.06917   Mean   :0.5547   Mean   :6.285   Mean   : 68.57  
##  3rd Qu.:0.00000   3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08  
##  Max.   :1.00000   Max.   :0.8710   Max.   :8.780   Max.   :100.00  
##                                                                     
##       dis              rad              tax           ptratio     
##  Min.   : 1.130   Min.   : 1.000   Min.   :187.0   Min.   :12.60  
##  1st Qu.: 2.100   1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40  
##  Median : 3.207   Median : 5.000   Median :330.0   Median :19.05  
##  Mean   : 3.795   Mean   : 9.549   Mean   :408.2   Mean   :18.46  
##  3rd Qu.: 5.188   3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20  
##  Max.   :12.127   Max.   :24.000   Max.   :711.0   Max.   :22.00  
##                                                                   
##      black            lstat            medv      
##  Min.   :  0.32   Min.   : 1.73   Min.   : 5.00  
##  1st Qu.:375.38   1st Qu.: 6.95   1st Qu.:17.20  
##  Median :391.44   Median :11.36   Median :21.50  
##  Mean   :356.67   Mean   :12.65   Mean   :22.75  
##  3rd Qu.:396.23   3rd Qu.:16.95   3rd Qu.:25.15  
##  Max.   :396.90   Max.   :37.97   Max.   :50.00  
##                                   NA's   :39
# Keep complete rows only: any row with an NA in any column is dropped.
boston <- na.omit(boston)
# Count the observations for each value of chas.
table(boston[["chas"]])
## 
##   0   1 
## 432  35
# Correlation between medv and rm.
with(boston, cor(medv, rm))
## [1] 0.6919567
# Correlation between medv and chas.
with(boston, cor(medv, chas))
## [1] 0.1728877
# data.table form DT[i, j, by]: filter rows with rm > 3, then take the mean
# of medv within each value of chas.
boston[rm > 3, mean(medv), by = chas]
##    chas       V1
## 1:    0 22.28588
## 2:    1 28.44000
# Same idea with two summaries per group (mean age and mean medv) for the
# rows where rm > 4.
boston[rm > 4, list(mean(age), mean(medv)), by = chas]
##    chas       V1       V2
## 1:    0 67.21953 22.27186
## 2:    1 77.50000 28.44000
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the columns medv, rm and chas.
boston2 <- select(boston, medv, rm, chas)
# Draw a random sample containing 10% of the rows.
boston3 <- sample_frac(boston, size = 0.1)
# Comments like these are always worth adding.

# Histogram of the medv values.
hist(boston$medv)

# Boxplots of medv, one box for each value of chas.
boxplot(boston$medv~boston$chas)

# Line plot of medv in row order, with a title.
# The title argument is `main`; argument names are case-sensitive, so the
# capitalised `Main` was passed on through `...`, rejected by each low-level
# plotting call with a "not a graphical parameter" warning, and no title was
# drawn.
plot(boston$medv, type = "l", main = "Boston!")
library(ggplot2)

# Quick ggplot2 histogram of medv.
# For help on a function use ?name (exact help topic) or ??keyword (search
# across the installed documentation).
qplot(boston$medv)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## If the help pages are not enough, search Google, Stack Overflow and the package vignettes.

# Fit a linear regression of medv on crim, rm and nox, then print the fitted
# call and coefficients.
a <- lm(medv ~ crim + rm + nox, data = boston)
a
## 
## Call:
## lm(formula = medv ~ crim + rm + nox, data = boston)
## 
## Coefficients:
## (Intercept)         crim           rm          nox  
##    -17.8180      -0.2067       7.7439     -13.3363
# Full regression report for model `a`: residual quantiles, coefficient table
# with standard errors, t values and p-values, R-squared and the F-statistic.
summary(a)
## 
## Call:
## lm(formula = medv ~ crim + rm + nox, data = boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.980  -3.260  -0.924   2.173  38.758 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17.81804    3.37839  -5.274 2.05e-07 ***
## crim         -0.20667    0.03579  -5.775 1.41e-08 ***
## rm            7.74390    0.42368  18.278  < 2e-16 ***
## nox         -13.33634    2.72049  -4.902 1.31e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.212 on 463 degrees of freedom
## Multiple R-squared:  0.5645, Adjusted R-squared:  0.5616 
## F-statistic:   200 on 3 and 463 DF,  p-value: < 2.2e-16
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Variance inflation factor of each predictor in model `a` (car package);
# used as a check for multicollinearity.
vif(a)
##     crim       rm      nox 
## 1.226828 1.121819 1.283600
# Test model `a` for outlying observations: reports the studentized residual
# of each flagged row with its unadjusted and Bonferroni-adjusted p-value.
outlierTest(a)
##     rstudent unadjusted p-value Bonferonni p
## 330 6.547274         1.5624e-10   7.2966e-08
## 334 5.477616         7.0938e-08   3.3128e-05
## 333 4.959858         9.9293e-07   4.6370e-04
## 327 4.737641         2.8822e-06   1.3460e-03
## 331 4.212256         3.0409e-05   1.4201e-02
# Check the dimensions before picking columns by position.
dim(boston)
## [1] 467  15
# k-means with 4 clusters on columns 10 to 15 (rad, tax, ptratio, black,
# lstat, medv).
# A data.table treats a lone index as a ROW selector, so boston[10:15]
# clustered only six rows. The column subset belongs in the `j` position, and
# `with = FALSE` makes 10:15 be read as column numbers.
# NOTE: the printed result recorded below came from the old six-row call and
# has to be regenerated. k-means starts from random centres, so call
# set.seed() beforehand if the result must be repeatable.
clus <- kmeans(boston[, 10:15, with = FALSE], centers = 4)
#?kmeans
clus
## K-means clustering with 4 clusters of sizes 2, 1, 1, 2
## 
## Cluster means:
##     V1     crim zn indus chas   nox     rm   age    dis rad tax ptratio
## 1 51.5 0.066050 21  5.64    0 0.439 6.0390 54.35 6.8147   4 243    16.8
## 2 49.0 0.253870  0  6.91    0 0.448 5.3990 95.30 5.8700   3 233    17.9
## 3 50.0 0.219770  0  6.91    0 0.448 5.6020 62.00 6.0877   3 233    17.9
## 4 53.5 0.051705 21  5.64    0 0.439 6.2545 21.25 6.8147   4 243    16.8
##     black  lstat medv
## 1 394.765 11.440 20.1
## 2 396.900 30.810 14.4
## 3 396.900 16.200 19.4
## 4 396.900  6.855 24.2
## 
## Clustering vector:
## [1] 2 3 1 1 4 4
## 
## Within cluster sum of squares by cluster:
## [1] 159.821831   0.000000   0.000000   6.917842
##  (between_SS / total_SS =  96.8 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
# Save the cleaned data (rows containing NA already removed) to out2.csv.
# row.names = FALSE stops write.csv from adding an unnamed column of row
# numbers, which would come back as an extra index column the next time the
# file is read (the V1 column seen after reading Boston.csv looks like one of
# these -- presumably created the same way).
write.csv(boston, file = "out2.csv", row.names = FALSE)


#rattle
#ts

#association analysis
# Refer to http://rpubs.com/ajaydecis
## refer to www.statmethods.net