#install.packages("data.table")
library(data.table)
## Warning: package 'data.table' was built under R version 3.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
# Read example_coffee.csv into a plain data.frame (data.table = FALSE), with
# character columns converted to factors. TRUE/FALSE are spelled out because
# the T/F shorthands are ordinary variables that can be reassigned.
DF <- fread(
  "example_coffee.csv",
  header = TRUE,
  stringsAsFactors = TRUE,
  data.table = FALSE
)
# Show each column's type and its first few values.
str(DF)
## 'data.frame': 46832 obs. of 23 variables:
## $ number : int 1 2 3 4 5 6 7 8 9 10 ...
## $ companyName : Factor w/ 36991 levels "#11(Sharp eleven)",..: 1 3 4 5 6 7 8 9 10 11 ...
## $ adress : Factor w/ 45170 levels "","강원도 강릉시 강동면 안인진리 3-5번지 통일공원 G동 2층",..: 32672 44985 14319 25658 35239 44592 44032 16782 4036 16970 ...
## $ adressBystreet : Factor w/ 38866 levels "","강원도 강릉시 가작로 13, 1층 (교동)",..: 28058 38633 12180 22436 1 38386 37843 14490 3373 14482 ...
## $ dateOflicensing : int 20080917 20101124 20130902 20121108 20020911 20130822 20140605 20111209 20130315 20110908 ...
## $ stateOfbusiness : Factor w/ 2 levels "운영중","폐업 등": 1 1 1 1 2 1 1 1 1 1 ...
## $ dateOfclosure : int NA NA NA NA 20071105 NA NA NA NA NA ...
## $ startdateOfcessation: logi NA NA NA NA NA NA ...
## $ duedateOfcessation : logi NA NA NA NA NA NA ...
## $ dateOfreOpen : logi NA NA NA NA NA NA ...
## $ areaOfsite : logi NA NA NA NA NA NA ...
## $ zip : logi NA NA NA NA NA NA ...
## $ waterwork : Factor w/ 5 levels "","간이상수도",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ numOfmenWorker : int NA NA NA NA 0 NA NA NA NA NA ...
## $ yearOfStart : int 2008 2010 2013 2012 2002 2013 2014 2011 2013 2011 ...
## $ multipleUse : Factor w/ 3 levels "","N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ grade : Factor w/ 8 levels "","갑","관리",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sizeOfsite : num 20.8 212.7 20 64.2 11.4 ...
## $ numOfwomenWorker : int NA NA NA NA 0 NA NA NA NA NA ...
## $ vicintyOfsite : Factor w/ 8 levels "","결혼예식장주변",..: 1 1 4 1 1 1 1 3 1 3 ...
## $ sanitaryName : Factor w/ 2 levels "","휴게음식점": 2 2 2 2 2 2 2 2 2 2 ...
## $ businessCondition : Factor w/ 2 levels "","커피숍": 2 2 2 2 2 2 2 2 2 2 ...
## $ totalOfworker : int NA NA NA NA 0 NA NA NA NA NA ...
# Remove the address, closure/cessation/reopening date and zip columns.
# The remaining columns keep their original order; match() yields NA for an
# unknown name, so a misspelled column still raises an error.
DF <- DF[
  ,
  -match(
    c(
      "adress", "adressBystreet", "dateOfclosure", "startdateOfcessation",
      "duedateOfcessation", "dateOfreOpen", "zip"
    ),
    names(DF)
  ),
  drop = FALSE
]
str(DF)
## 'data.frame': 46832 obs. of 16 variables:
## $ number : int 1 2 3 4 5 6 7 8 9 10 ...
## $ companyName : Factor w/ 36991 levels "#11(Sharp eleven)",..: 1 3 4 5 6 7 8 9 10 11 ...
## $ dateOflicensing : int 20080917 20101124 20130902 20121108 20020911 20130822 20140605 20111209 20130315 20110908 ...
## $ stateOfbusiness : Factor w/ 2 levels "운영중","폐업 등": 1 1 1 1 2 1 1 1 1 1 ...
## $ areaOfsite : logi NA NA NA NA NA NA ...
## $ waterwork : Factor w/ 5 levels "","간이상수도",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ numOfmenWorker : int NA NA NA NA 0 NA NA NA NA NA ...
## $ yearOfStart : int 2008 2010 2013 2012 2002 2013 2014 2011 2013 2011 ...
## $ multipleUse : Factor w/ 3 levels "","N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ grade : Factor w/ 8 levels "","갑","관리",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ sizeOfsite : num 20.8 212.7 20 64.2 11.4 ...
## $ numOfwomenWorker : int NA NA NA NA 0 NA NA NA NA NA ...
## $ vicintyOfsite : Factor w/ 8 levels "","결혼예식장주변",..: 1 1 4 1 1 1 1 3 1 3 ...
## $ sanitaryName : Factor w/ 2 levels "","휴게음식점": 2 2 2 2 2 2 2 2 2 2 ...
## $ businessCondition: Factor w/ 2 levels "","커피숍": 2 2 2 2 2 2 2 2 2 2 ...
## $ totalOfworker : int NA NA NA NA 0 NA NA NA NA NA ...
# Earliest and latest opening year over all rows.
range(DF$yearOfStart, na.rm = TRUE)
## [1] 1964 2015
# na.rm must be set so that missing values are dropped; without it range()
# cannot return a usable result for this column.
# List the rows from the earliest opening year. The year is taken from the
# data rather than typed in from the printed range above.
subset(DF, subset = yearOfStart == min(yearOfStart, na.rm = TRUE))
## number companyName dateOflicensing stateOfbusiness areaOfsite
## 23035 23035 엠에스커피 19641125 폐업 등 NA
## 46290 46290 홀릭 19640929 폐업 등 NA
## waterwork numOfmenWorker yearOfStart multipleUse grade sizeOfsite
## 23035 상수도전용 0 1964 N 0
## 46290 0 1964 N 기타 0
## numOfwomenWorker vicintyOfsite sanitaryName businessCondition
## 23035 0 기타 휴게음식점 커피숍
## 46290 0 기타 휴게음식점 커피숍
## totalOfworker
## 23035 0
## 46290 0
# Keep only the rows whose stateOfbusiness level is "운영중"
# (the "operating" level; the other level is "폐업 등").
DFFilter <- subset(DF, subset = stateOfbusiness == "운영중")
# Earliest and latest opening year among those rows; na.rm = TRUE drops
# missing years.
range(DFFilter$yearOfStart, na.rm = TRUE)
## [1] 1967 2015
# Rows from the earliest opening year among the operating shops. The year is
# derived from the data instead of being copied from the printed range.
# (An interactive `?subset` help lookup used to sit here; run it by hand for
# the argument reference.)
subset(DFFilter, subset = yearOfStart == min(yearOfStart, na.rm = TRUE))
## number companyName dateOflicensing stateOfbusiness areaOfsite
## 24108 24108 왕관 커피숍 19671013 운영중 NA
## 44934 44934 학커피숍 19670414 운영중 NA
## waterwork numOfmenWorker yearOfStart multipleUse grade sizeOfsite
## 24108 상수도전용 0 1967 N 갑 76.02
## 44934 0 1967 N 59.13
## numOfwomenWorker vicintyOfsite sanitaryName businessCondition
## 24108 1 기타 휴게음식점 커피숍
## 44934 0 휴게음식점 커피숍
## totalOfworker
## 24108 1
## 44934 0
# Count the rows per opening year (rows with a missing year are not counted).
table(DF[["yearOfStart"]])
##
## 1964 1966 1967 1968 1969 1970 1971 1972 1974 1975 1976 1979 1980 1981 1982
## 2 2 3 1 2 4 6 3 1 2 5 4 9 8 12
## 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997
## 9 11 18 21 21 26 23 25 28 37 50 48 48 41 54
## 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
## 54 46 89 183 398 799 648 654 863 1233 1579 2489 4172 5942 6315
## 2013 2014 2015
## 7270 9905 3650
# Bar chart of row counts per opening year, using the qplot() shorthand.
qplot(
  x = yearOfStart,
  data = DF,
  geom = "bar"
)
## Warning: Removed 19 rows containing non-finite values (stat_count).

# The same chart written with the full ggplot() grammar. geom_bar() no longer
# has a `binwidth` parameter, so the one-year bins are requested from
# geom_histogram() directly.
ggplot(data = DF, aes(x = yearOfStart)) +
  geom_histogram(binwidth = 1, size = 0, color = "red")
## Warning: Removed 19 rows containing non-finite values (stat_bin).

# The two plots above display the same information.
# Cross-tabulate business state (rows) against opening year (columns).
Freq <- table(
  DF[["stateOfbusiness"]],
  DF[["yearOfStart"]]
)
Freq
##
## 1964 1966 1967 1968 1969 1970 1971 1972 1974 1975 1976 1979 1980
## 운영중 0 0 2 0 0 2 4 2 1 1 1 2 3
## 폐업 등 2 2 1 1 2 2 2 1 0 1 4 2 6
##
## 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993
## 운영중 6 2 3 4 5 5 6 11 5 7 7 3 14
## 폐업 등 2 10 6 7 13 16 15 15 18 18 21 34 36
##
## 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## 운영중 14 13 14 21 25 23 26 76 105 163 180 204 314
## 폐업 등 34 35 27 33 29 23 63 107 293 636 468 450 549
##
## 2007 2008 2009 2010 2011 2012 2013 2014 2015
## 운영중 496 729 1229 2503 3961 4642 6045 9125 3564
## 폐업 등 737 850 1260 1669 1981 1673 1225 780 86
# Restrict Freq to the years from 1997 up to, but not including, the latest
# year present, then convert the counts to per-year proportions.
# The column positions are computed from the year labels instead of being
# typed in from console output, so they follow the data if it changes.
year_labels <- colnames(Freq)
first_col <- which(year_labels == "1997")
first_col
## [1] 30
# The labels are character strings; convert them explicitly before looking
# for the largest year.
last_col <- which.max(as.integer(year_labels))
last_col
## [1] 48
# Fail early if 1997 is absent or is itself the latest year.
stopifnot(length(first_col) == 1L, last_col > first_col)
# NOTE(review): the latest year (2015) is left out, presumably because it is
# only partially covered by the data -- confirm.
Freq <- Freq[, first_col:(last_col - 1L)]
# margin = 2: each column (year) sums to 1.
PFreq <- prop.table(Freq, margin = 2)
PFreq
##
## 1997 1998 1999 2000 2001
## 운영중 0.38888889 0.46296296 0.50000000 0.29213483 0.41530055
## 폐업 등 0.61111111 0.53703704 0.50000000 0.70786517 0.58469945
##
## 2002 2003 2004 2005 2006
## 운영중 0.26381910 0.20400501 0.27777778 0.31192661 0.36384705
## 폐업 등 0.73618090 0.79599499 0.72222222 0.68807339 0.63615295
##
## 2007 2008 2009 2010 2011
## 운영중 0.40227088 0.46168461 0.49377260 0.59995206 0.66661057
## 폐업 등 0.59772912 0.53831539 0.50622740 0.40004794 0.33338943
##
## 2012 2013 2014
## 운영중 0.73507522 0.83149931 0.92125189
## 폐업 등 0.26492478 0.16850069 0.07874811
# Build one row per year: the year label, the counts from rows 1 and 2 of
# Freq, and the matching per-year proportions from rows 1 and 2 of PFreq.
# data.frame() derives the column names from the argument expressions, so
# the expressions are left exactly as written to keep the printed header.
NewDF = data.frame(colnames(Freq), Freq[1,],Freq[2,],PFreq[1,],PFreq[2,])
NewDF
## colnames.Freq. Freq.1... Freq.2... PFreq.1... PFreq.2...
## 1997 1997 21 33 0.3888889 0.61111111
## 1998 1998 25 29 0.4629630 0.53703704
## 1999 1999 23 23 0.5000000 0.50000000
## 2000 2000 26 63 0.2921348 0.70786517
## 2001 2001 76 107 0.4153005 0.58469945
## 2002 2002 105 293 0.2638191 0.73618090
## 2003 2003 163 636 0.2040050 0.79599499
## 2004 2004 180 468 0.2777778 0.72222222
## 2005 2005 204 450 0.3119266 0.68807339
## 2006 2006 314 549 0.3638470 0.63615295
## 2007 2007 496 737 0.4022709 0.59772912
## 2008 2008 729 850 0.4616846 0.53831539
## 2009 2009 1229 1260 0.4937726 0.50622740
## 2010 2010 2503 1669 0.5999521 0.40004794
## 2011 2011 3961 1981 0.6666106 0.33338943
## 2012 2012 4642 1673 0.7350752 0.26492478
## 2013 2013 6045 1225 0.8314993 0.16850069
## 2014 2014 9125 780 0.9212519 0.07874811
# Assign readable column names and swap the year row labels for the default
# 1..n numbering, then print the result.
names(NewDF) <- c("Time", "Open", "Close", "POpen", "PClose")
row.names(NewDF) <- NULL
NewDF
## Time Open Close POpen PClose
## 1 1997 21 33 0.3888889 0.61111111
## 2 1998 25 29 0.4629630 0.53703704
## 3 1999 23 23 0.5000000 0.50000000
## 4 2000 26 63 0.2921348 0.70786517
## 5 2001 76 107 0.4153005 0.58469945
## 6 2002 105 293 0.2638191 0.73618090
## 7 2003 163 636 0.2040050 0.79599499
## 8 2004 180 468 0.2777778 0.72222222
## 9 2005 204 450 0.3119266 0.68807339
## 10 2006 314 549 0.3638470 0.63615295
## 11 2007 496 737 0.4022709 0.59772912
## 12 2008 729 850 0.4616846 0.53831539
## 13 2009 1229 1260 0.4937726 0.50622740
## 14 2010 2503 1669 0.5999521 0.40004794
## 15 2011 3961 1981 0.6666106 0.33338943
## 16 2012 4642 1673 0.7350752 0.26492478
## 17 2013 6045 1225 0.8314993 0.16850069
## 18 2014 9125 780 0.9212519 0.07874811
# Plot the yearly Close and Open counts as two line-and-point series
# (blue shades = Close, red shades = Open). Time is mapped as a factor, so
# group = 1 puts all years into one group and lets the lines connect.
ggplot(NewDF, aes(x = factor(Time), y = Close, group = 1)) +
  geom_line(colour = "steelblue1", size = 1) +
  geom_point(colour = "steelblue2", size = 3) +
  geom_line(aes(y = Open), colour = "tomato2", size = 1) +
  geom_point(aes(y = Open), colour = "red", size = 3) +
  theme_bw()
