#install.packages("data.table")
library(data.table)
## Warning: package 'data.table' was built under R version 3.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
# Read example_coffee.csv. data.table = FALSE makes fread() return a plain
# data.frame, and stringsAsFactors = TRUE converts text columns to factors.
# TRUE is spelled out because T is an ordinary, reassignable variable rather
# than a reserved word.
DF <- fread(
  "example_coffee.csv",
  header = TRUE,
  stringsAsFactors = TRUE,
  data.table = FALSE
)
# Show the type and first values of every column.
str(DF)
## 'data.frame':    46832 obs. of  23 variables:
##  $ number              : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ companyName         : Factor w/ 36991 levels "#11(Sharp eleven)",..: 1 3 4 5 6 7 8 9 10 11 ...
##  $ adress              : Factor w/ 45170 levels "","강원도 강릉시 강동면 안인진리 3-5번지 통일공원 G동 2층",..: 32672 44985 14319 25658 35239 44592 44032 16782 4036 16970 ...
##  $ adressBystreet      : Factor w/ 38866 levels "","강원도 강릉시 가작로 13, 1층 (교동)",..: 28058 38633 12180 22436 1 38386 37843 14490 3373 14482 ...
##  $ dateOflicensing     : int  20080917 20101124 20130902 20121108 20020911 20130822 20140605 20111209 20130315 20110908 ...
##  $ stateOfbusiness     : Factor w/ 2 levels "운영중","폐업 등": 1 1 1 1 2 1 1 1 1 1 ...
##  $ dateOfclosure       : int  NA NA NA NA 20071105 NA NA NA NA NA ...
##  $ startdateOfcessation: logi  NA NA NA NA NA NA ...
##  $ duedateOfcessation  : logi  NA NA NA NA NA NA ...
##  $ dateOfreOpen        : logi  NA NA NA NA NA NA ...
##  $ areaOfsite          : logi  NA NA NA NA NA NA ...
##  $ zip                 : logi  NA NA NA NA NA NA ...
##  $ waterwork           : Factor w/ 5 levels "","간이상수도",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ numOfmenWorker      : int  NA NA NA NA 0 NA NA NA NA NA ...
##  $ yearOfStart         : int  2008 2010 2013 2012 2002 2013 2014 2011 2013 2011 ...
##  $ multipleUse         : Factor w/ 3 levels "","N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ grade               : Factor w/ 8 levels "","갑","관리",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sizeOfsite          : num  20.8 212.7 20 64.2 11.4 ...
##  $ numOfwomenWorker    : int  NA NA NA NA 0 NA NA NA NA NA ...
##  $ vicintyOfsite       : Factor w/ 8 levels "","결혼예식장주변",..: 1 1 4 1 1 1 1 3 1 3 ...
##  $ sanitaryName        : Factor w/ 2 levels "","휴게음식점": 2 2 2 2 2 2 2 2 2 2 ...
##  $ businessCondition   : Factor w/ 2 levels "","커피숍": 2 2 2 2 2 2 2 2 2 2 ...
##  $ totalOfworker       : int  NA NA NA NA 0 NA NA NA NA NA ...
# Remove the address columns and the closure / cessation / reopening / zip
# columns; every other column is kept in its original order.
DF <- subset(
  DF,
  select = -c(
    adress, adressBystreet, dateOfclosure, startdateOfcessation,
    duedateOfcessation, dateOfreOpen, zip
  )
)
# Re-check the structure after dropping the columns.
str(DF)
## 'data.frame':    46832 obs. of  16 variables:
##  $ number           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ companyName      : Factor w/ 36991 levels "#11(Sharp eleven)",..: 1 3 4 5 6 7 8 9 10 11 ...
##  $ dateOflicensing  : int  20080917 20101124 20130902 20121108 20020911 20130822 20140605 20111209 20130315 20110908 ...
##  $ stateOfbusiness  : Factor w/ 2 levels "운영중","폐업 등": 1 1 1 1 2 1 1 1 1 1 ...
##  $ areaOfsite       : logi  NA NA NA NA NA NA ...
##  $ waterwork        : Factor w/ 5 levels "","간이상수도",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ numOfmenWorker   : int  NA NA NA NA 0 NA NA NA NA NA ...
##  $ yearOfStart      : int  2008 2010 2013 2012 2002 2013 2014 2011 2013 2011 ...
##  $ multipleUse      : Factor w/ 3 levels "","N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ grade            : Factor w/ 8 levels "","갑","관리",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ sizeOfsite       : num  20.8 212.7 20 64.2 11.4 ...
##  $ numOfwomenWorker : int  NA NA NA NA 0 NA NA NA NA NA ...
##  $ vicintyOfsite    : Factor w/ 8 levels "","결혼예식장주변",..: 1 1 4 1 1 1 1 3 1 3 ...
##  $ sanitaryName     : Factor w/ 2 levels "","휴게음식점": 2 2 2 2 2 2 2 2 2 2 ...
##  $ businessCondition: Factor w/ 2 levels "","커피숍": 2 2 2 2 2 2 2 2 2 2 ...
##  $ totalOfworker    : int  NA NA NA NA 0 NA NA NA NA NA ...
# Span of opening years. na.rm = TRUE is required: yearOfStart contains
# missing values, and range() would otherwise return NA.
range(DF$yearOfStart, na.rm = TRUE)
## [1] 1964 2015

# Inspect the records from the earliest opening year reported above.
subset(DF, subset = (yearOfStart == 1964))
##       number companyName dateOflicensing stateOfbusiness areaOfsite
## 23035  23035  엠에스커피        19641125         폐업 등         NA
## 46290  46290        홀릭        19640929         폐업 등         NA
##        waterwork numOfmenWorker yearOfStart multipleUse grade sizeOfsite
## 23035 상수도전용              0        1964           N                0
## 46290                         0        1964           N  기타          0
##       numOfwomenWorker vicintyOfsite sanitaryName businessCondition
## 23035                0          기타   휴게음식점            커피숍
## 46290                0          기타   휴게음식점            커피숍
##       totalOfworker
## 23035             0
## 46290             0
# Keep only the shops whose business status is "운영중" (operating).
DFFilter <- subset(DF, subset = (stateOfbusiness == "운영중"))
# Opening-year span among those shops; na.rm = TRUE skips missing years.
range(DFFilter$yearOfStart, na.rm = TRUE)
## [1] 1967 2015
# Interactive help lookup for subset().
?subset
## starting httpd help server ...
##  done
# Inspect the operating shops from the earliest opening year reported above.
subset(DFFilter, subset = (yearOfStart == 1967))
##       number companyName dateOflicensing stateOfbusiness areaOfsite
## 24108  24108 왕관 커피숍        19671013          운영중         NA
## 44934  44934    학커피숍        19670414          운영중         NA
##        waterwork numOfmenWorker yearOfStart multipleUse grade sizeOfsite
## 24108 상수도전용              0        1967           N    갑      76.02
## 44934                         0        1967           N            59.13
##       numOfwomenWorker vicintyOfsite sanitaryName businessCondition
## 24108                1          기타   휴게음식점            커피숍
## 44934                0                 휴게음식점            커피숍
##       totalOfworker
## 24108             1
## 44934             0
# Number of records per opening year.
table(DF[["yearOfStart"]])
## 
## 1964 1966 1967 1968 1969 1970 1971 1972 1974 1975 1976 1979 1980 1981 1982 
##    2    2    3    1    2    4    6    3    1    2    5    4    9    8   12 
## 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 
##    9   11   18   21   21   26   23   25   28   37   50   48   48   41   54 
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 
##   54   46   89  183  398  799  648  654  863 1233 1579 2489 4172 5942 6315 
## 2013 2014 2015 
## 7270 9905 3650
# Bar chart of the number of records per opening year.
qplot(yearOfStart, data = DF, geom = "bar")
## Warning: Removed 19 rows containing non-finite values (stat_count).

# Same chart built with ggplot(). geom_bar() no longer has a `binwidth`
# parameter (ggplot2 asks for geom_histogram() instead), so the one-year
# bins are requested from geom_histogram() directly.
ggplot(data = DF, aes(x = yearOfStart)) +
  geom_histogram(binwidth = 1, size = 0, color = "red")
## Warning: Removed 19 rows containing non-finite values (stat_bin).

# The two plots above display the same information.

# Cross-tabulate business status (rows) against opening year (columns).
Freq <- table(DF[["stateOfbusiness"]], DF[["yearOfStart"]])
print(Freq)
##          
##           1964 1966 1967 1968 1969 1970 1971 1972 1974 1975 1976 1979 1980
##   운영중     0    0    2    0    0    2    4    2    1    1    1    2    3
##   폐업 등    2    2    1    1    2    2    2    1    0    1    4    2    6
##          
##           1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993
##   운영중     6    2    3    4    5    5    6   11    5    7    7    3   14
##   폐업 등    2   10    6    7   13   16   15   15   18   18   21   34   36
##          
##           1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
##   운영중    14   13   14   21   25   23   26   76  105  163  180  204  314
##   폐업 등   34   35   27   33   29   23   63  107  293  636  468  450  549
##          
##           2007 2008 2009 2010 2011 2012 2013 2014 2015
##   운영중   496  729 1229 2503 3961 4642 6045 9125 3564
##   폐업 등  737  850 1260 1669 1981 1673 1225  780   86
# Position of the first year to keep.
which(colnames(Freq) == "1997")
## [1] 30
# Position of the most recent year. Column names are character strings, so
# they are converted to integers explicitly before taking the maximum.
which.max(as.integer(colnames(Freq)))
## [1] 48
# Keep 1997-2014 by matching on the year labels rather than on hard-coded
# column positions, so the selection stays correct if the set of years in the
# data changes. drop = FALSE keeps the two-dimensional shape even if only a
# single year matches.
Freq <- Freq[, colnames(Freq) %in% as.character(1997:2014), drop = FALSE]

# Share of each business status within every year (each column sums to 1).
PFreq <- prop.table(Freq, margin = 2)
PFreq
##          
##                 1997       1998       1999       2000       2001
##   운영중  0.38888889 0.46296296 0.50000000 0.29213483 0.41530055
##   폐업 등 0.61111111 0.53703704 0.50000000 0.70786517 0.58469945
##          
##                 2002       2003       2004       2005       2006
##   운영중  0.26381910 0.20400501 0.27777778 0.31192661 0.36384705
##   폐업 등 0.73618090 0.79599499 0.72222222 0.68807339 0.63615295
##          
##                 2007       2008       2009       2010       2011
##   운영중  0.40227088 0.46168461 0.49377260 0.59995206 0.66661057
##   폐업 등 0.59772912 0.53831539 0.50622740 0.40004794 0.33338943
##          
##                 2012       2013       2014
##   운영중  0.73507522 0.83149931 0.92125189
##   폐업 등 0.26492478 0.16850069 0.07874811
# Assemble a per-year data frame from the two tables: the year labels, the
# counts in row 1 and row 2 of Freq, and the matching proportions from PFreq.
# The columns are not named here, so data.frame() generates names from the
# expressions; they are replaced with readable names further down.
NewDF = data.frame(colnames(Freq), Freq[1,],Freq[2,],PFreq[1,],PFreq[2,])
# Print the assembled frame to check it.
NewDF
##      colnames.Freq. Freq.1... Freq.2... PFreq.1... PFreq.2...
## 1997           1997        21        33  0.3888889 0.61111111
## 1998           1998        25        29  0.4629630 0.53703704
## 1999           1999        23        23  0.5000000 0.50000000
## 2000           2000        26        63  0.2921348 0.70786517
## 2001           2001        76       107  0.4153005 0.58469945
## 2002           2002       105       293  0.2638191 0.73618090
## 2003           2003       163       636  0.2040050 0.79599499
## 2004           2004       180       468  0.2777778 0.72222222
## 2005           2005       204       450  0.3119266 0.68807339
## 2006           2006       314       549  0.3638470 0.63615295
## 2007           2007       496       737  0.4022709 0.59772912
## 2008           2008       729       850  0.4616846 0.53831539
## 2009           2009      1229      1260  0.4937726 0.50622740
## 2010           2010      2503      1669  0.5999521 0.40004794
## 2011           2011      3961      1981  0.6666106 0.33338943
## 2012           2012      4642      1673  0.7350752 0.26492478
## 2013           2013      6045      1225  0.8314993 0.16850069
## 2014           2014      9125       780  0.9212519 0.07874811
# Discard the year-labelled row names and give the columns readable names:
# year, the two counts, and the two proportions.
row.names(NewDF) <- NULL
names(NewDF) <- c("Time","Open","Close","POpen","PClose")
print(NewDF)
##    Time Open Close     POpen     PClose
## 1  1997   21    33 0.3888889 0.61111111
## 2  1998   25    29 0.4629630 0.53703704
## 3  1999   23    23 0.5000000 0.50000000
## 4  2000   26    63 0.2921348 0.70786517
## 5  2001   76   107 0.4153005 0.58469945
## 6  2002  105   293 0.2638191 0.73618090
## 7  2003  163   636 0.2040050 0.79599499
## 8  2004  180   468 0.2777778 0.72222222
## 9  2005  204   450 0.3119266 0.68807339
## 10 2006  314   549 0.3638470 0.63615295
## 11 2007  496   737 0.4022709 0.59772912
## 12 2008  729   850 0.4616846 0.53831539
## 13 2009 1229  1260 0.4937726 0.50622740
## 14 2010 2503  1669 0.5999521 0.40004794
## 15 2011 3961  1981 0.6666106 0.33338943
## 16 2012 4642  1673 0.7350752 0.26492478
## 17 2013 6045  1225 0.8314993 0.16850069
## 18 2014 9125   780 0.9212519 0.07874811
# Yearly counts as two overlaid series: Close in blue and Open in red.
# Time is converted to a factor, so group = 1 is needed to connect the points
# into a single line.
ggplot(NewDF, aes(x = factor(Time), y = Close, group = 1)) +
  geom_line(colour = "steelblue1", size = 1) +
  geom_point(colour = "steelblue2", size = 3) +
  geom_line(aes(y = Open), colour = "tomato2", size = 1) +
  geom_point(aes(y = Open), colour = "red", size = 3) +
  theme_bw()