Graphical analysis of a text using the wordcloud package
# Word-cloud analysis of textanalysis.txt: build a tm corpus, normalise the
# text, count term frequencies and plot the most frequent terms.
library(NLP)
library(tm)
library(wordcloud)

# Read the text line by line; each element of the vector becomes one document.
aFile <- readLines("textanalysis.txt")
myCorpus <- Corpus(VectorSource(aFile))

# Normalise the text. tolower() is a plain character function, so it is
# wrapped in content_transformer() to keep the corpus elements valid tm
# documents regardless of the corpus class.
# Session log: the console printed "transformation drops documents" after each
# of the four tm_map() calls below.
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, removeWords, stopwords("english"))

# Term-document matrix keeping only terms of at least 3 characters.
# `wordLengths` is the control option documented by current tm releases; the
# older `minWordLength` name is not among them (NOTE(review): confirm against
# the installed tm version).
myDTM <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(3, Inf)))
m <- as.matrix(myDTM)

# Total count of every term over all documents, most frequent first.
v <- sort(rowSums(m), decreasing = TRUE)

# Fixed seed so the random layout of the cloud is reproducible.
set.seed(433)

# Plot at most 50 terms, skipping those with a frequency below 20.
wordcloud(
  words = names(v),
  freq = v,
  scale = c(2, .1),
  min.freq = 20,
  max.words = 50,
  colors = c("green", "red", "blue"),
  random.color = FALSE
)
The model is trained on historical data. Using the fitted model parameters, you can then make a “prediction” of the response variable for new data (new values of the predictors).
# training data
# Load the historical customer records used to fit the model.
# stringsAsFactors = TRUE keeps the text columns (`gender`, `buy`) as factors.
# Since R 4.0.0 read.csv() leaves text columns as character by default, and
# the binomial glm() fitted on this data needs a factor response, not a
# character one.
customer <- read.csv("customer.csv", header = TRUE, stringsAsFactors = TRUE)

# Print the data and its structure for a quick visual check.
customer
str(customer)
'data.frame': 100 obs. of 5 variables:
$ CustomerID : int 1 2 3 4 5 6 7 8 9 10 ...
$ gender : Factor w/ 2 levels "F","M": 1 2 1 1 2 2 1 1 2 2 ...
$ age : int 36 26 21 49 42 49 47 50 26 40 ...
$ visit.times: int 5 3 2 5 4 1 4 1 2 3 ...
$ buy : Factor w/ 2 levels "no","yes": 2 1 2 2 1 1 2 1 1 1 ...
# fitting the model
# Logistic regression of `buy` on the number of visits, age and gender,
# estimated from the `customer` training data.
logitfit <- glm(
  formula = buy ~ visit.times + age + gender,
  family = binomial(logit),
  data = customer
)
glm.fit: fitted probabilities numerically 0 or 1 occurred
# Show the fitted model: deviance residuals, coefficient table, null and
# residual deviance, and AIC.
summary(logitfit)
Call:
glm(formula = buy ~ visit.times + age + gender, family = binomial(logit),
data = customer)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.909 0.000 0.000 0.000 1.245
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 26.5278 18.6925 1.419 0.156
visit.times 9.7809 6.1264 1.597 0.110
age -1.1396 0.7592 -1.501 0.133
genderM -71.0222 4170.8348 -0.017 0.986
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 133.7496 on 99 degrees of freedom
Residual deviance: 7.1936 on 96 degrees of freedom
AIC: 15.194
Number of Fisher Scoring iterations: 21
# predicting outcome with new data
# Three new customers to score. `buy` is the response to be predicted, so it
# is left blank here.
CustomerID <- c(1, 2, 3)
gender <- c("F", "F", "M")
age <- c(36, 50, 21)
visit.times <- c(5, 1, 2)
buy <- rep("", 3)

# Assemble the new observations using the same column names as the training
# data.
custom2 <- data.frame(
  CustomerID = CustomerID,
  gender = gender,
  age = age,
  visit.times = visit.times,
  buy = buy
)
custom2

# type = "response" gives predictions on the scale of the response, i.e.
# probabilities for this binomial model.
pr <- predict(logitfit, newdata = custom2, type = "response")
pr
1 2 3
1.000000e+00 1.055095e-09 2.220446e-16
Starting from a world map, you can select a country and plot chosen cities on it.
library(maps)
library(mapdata)
library(ggplot2)
library(ggrepel)
# Cities to label on the zoomed map.
cities <- c("San Juan")

# Outline data for the world map, in the long/lat/group form that
# geom_polygon() consumes below.
global <- map_data("world")

# World polygons kept as a reusable layer, so the zoomed map can be built
# with its own coordinate window instead of replacing the one on `gg1`.
world_layer <- geom_polygon(
  data = global,
  aes(x = long, y = lat, group = group),
  fill = "green",
  color = "blue"
)

# Full world map with a fixed 1.3 aspect ratio.
gg1 <- ggplot() + world_layer + coord_fixed(1.3)
gg1

# Coordinates of each city, one row per entry of `cities`.
coors <- data.frame(
  long = -66.11,
  lat = 18.45,
  cities = cities,
  stringsAsFactors = FALSE
)

# Zoom on Puerto Rico through the coordinate system. xlim()/ylim() would set
# scale limits, which discard every polygon vertex outside the window and can
# distort or drop shapes; coord limits only crop the view.
ggplot() +
  world_layer +
  geom_point(data = coors, aes(long, lat), colour = "red", size = 1) +
  geom_text_repel(data = coors, aes(long, lat, label = cities)) +
  ggtitle("Puerto Rico") +
  coord_fixed(1.3, xlim = c(-68, -64.5), ylim = c(17.5, 19))