library(ggplot2)
theme_set(theme_bw(base_family = "STKaiti"))
## 主成分分析对数据进行降维
library(R.matlab)
## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.
##
## Attaching package: 'R.matlab'
## The following objects are masked from 'package:base':
##
## getOption, isOpen
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
ETHdata <- readMat("data/chap6/ETH_8class_object_8_big_classes_32_32_1024D.mat")
ETHims <- t(ETHdata$A / 255.0)
dim(ETHims)
## [1] 3280 1024
labels <- as.vector(ETHdata$labels)
table(labels)
## labels
## 1 2 3 4 5 6 7 8
## 410 410 410 410 410 410 410 410
## 可视化碎石图,选择合适的主成分数
parpca <- fa.parallel(ETHims,fa = "pc")
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined.
## Chi square is based upon observed residuals.
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined for the null model either.
## The Chi square is thus based upon observed correlations.
## Parallel analysis suggests that the number of factors = NA and the number of components = 40
## 可视化碎石图的部分图像
pcanum <- 50
plotdata <- data.frame(x = 1:pcanum,pc.values = parpca$pc.values[1:pcanum])
ggplot(plotdata,aes(x = x,y = pc.values))+
theme_bw(base_family = "STKaiti")+
geom_point(colour = "red")+geom_line(colour = "blue")+
labs(x = "主成分个数")
## 提取前40个主成分
ETHcor <- cor(ETHims,ETHims)
dim(ETHcor)
## [1] 1024 1024
ETHpca2 <- principal(ETHims,nfactors = 40)
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined.
## Chi square is based upon observed residuals.
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined for the null model either.
## The Chi square is thus based upon observed correlations.
## 使用pca模型获取数据集的40个主成分
ETHpca40<- predict.psych(ETHpca2,ETHims)
library(caret)
## Loading required package: lattice
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## 使用这40个主成分建立KNN模型
## 切分训练集和测试集
set.seed(12)
index <- createDataPartition(labels,p=0.75)
labels <- as.factor(labels)
train_ETH <- ETHpca40[index$Resample1,]
train_lab <- labels[index$Resample1]
test_ETH <- ETHpca40[-index$Resample1,]
test_lab <- labels[-index$Resample1]
## KNN分类器
ETHknn <- knn3(x=train_ETH,y=train_lab,k=5)
ETHknn
## 5-nearest neighbor model
## Training set outcome distribution:
##
## 1 2 3 4 5 6 7 8
## 311 304 302 313 312 303 308 307
test_pre <- predict(ETHknn,test_ETH,type = "class")
## 计算KNN模型的精度
table(test_lab,test_pre)
## test_pre
## test_lab 1 2 3 4 5 6 7 8
## 1 81 0 0 0 0 0 0 18
## 2 0 94 1 0 6 2 3 0
## 3 0 3 73 1 11 18 0 2
## 4 0 0 0 91 0 0 0 6
## 5 0 1 15 0 64 16 2 0
## 6 0 1 15 0 8 83 0 0
## 7 0 0 0 0 0 0 102 0
## 8 1 0 0 0 0 0 0 102
sprintf("KNN分类精度为%.4f",accuracy(test_lab,test_pre))
## [1] "KNN分类精度为0.8415"
## 使用混淆矩阵热力图可视化哪些预测正确
confum <- confusionMatrix(test_lab,test_pre)
confum
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3 4 5 6 7 8
## 1 81 0 0 0 0 0 0 18
## 2 0 94 1 0 6 2 3 0
## 3 0 3 73 1 11 18 0 2
## 4 0 0 0 91 0 0 0 6
## 5 0 1 15 0 64 16 2 0
## 6 0 1 15 0 8 83 0 0
## 7 0 0 0 0 0 0 102 0
## 8 1 0 0 0 0 0 0 102
##
## Overall Statistics
##
## Accuracy : 0.8415
## 95% CI : (0.8146, 0.8658)
## No Information Rate : 0.1561
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8187
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity 0.98780 0.9495 0.70192 0.9891 0.71910 0.6975
## Specificity 0.97561 0.9834 0.95112 0.9918 0.95349 0.9658
## Pos Pred Value 0.81818 0.8868 0.67593 0.9381 0.65306 0.7757
## Neg Pred Value 0.99861 0.9930 0.95646 0.9986 0.96537 0.9495
## Prevalence 0.10000 0.1207 0.12683 0.1122 0.10854 0.1451
## Detection Rate 0.09878 0.1146 0.08902 0.1110 0.07805 0.1012
## Detection Prevalence 0.12073 0.1293 0.13171 0.1183 0.11951 0.1305
## Balanced Accuracy 0.98171 0.9664 0.82652 0.9904 0.83629 0.8316
## Class: 7 Class: 8
## Sensitivity 0.9533 0.7969
## Specificity 1.0000 0.9986
## Pos Pred Value 1.0000 0.9903
## Neg Pred Value 0.9930 0.9637
## Prevalence 0.1305 0.1561
## Detection Rate 0.1244 0.1244
## Detection Prevalence 0.1244 0.1256
## Balanced Accuracy 0.9766 0.8977
confumat <- as.data.frame(confum$table)
confumat[,1:2] <- apply(confumat[,1:2],2,as.integer)
ggplot(confumat,aes(x=Reference,y = Prediction))+
geom_tile(aes(fill = Freq))+
geom_text(aes(label = Freq))+
scale_x_continuous(breaks = c(0:8))+
scale_y_continuous(breaks = unique(confumat$Prediction),
trans = "reverse")+
scale_fill_gradient2(low="darkblue", high="lightgreen",
guide="colorbar")+
ggtitle("KNN分类器在测试集结果")
## 参数搜索,找到精度更高的模型
set.seed(123)
## 使用交叉验证,5 fold cv
trcl <- trainControl(method="cv",number = 5)
trgrid <- expand.grid(k=seq(1,25,2))
ETHknnFit <- train(x=train_ETH,y=train_lab, method = "knn",
trControl = trcl,tuneGrid = trgrid)
ETHknnFit
## k-Nearest Neighbors
##
## 2460 samples
## 40 predictor
## 8 classes: '1', '2', '3', '4', '5', '6', '7', '8'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1970, 1967, 1966, 1970, 1967
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.8711549 0.8527493
## 3 0.8503850 0.8290112
## 5 0.8402421 0.8174145
## 7 0.8300885 0.8058106
## 9 0.8259936 0.8011274
## 11 0.8097374 0.7825469
## 13 0.8028061 0.7746248
## 15 0.7954914 0.7662632
## 17 0.7869448 0.7564965
## 19 0.7812512 0.7499867
## 21 0.7731442 0.7407209
## 23 0.7617454 0.7276935
## 25 0.7564434 0.7216332
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
## plot 近邻数和精度的关系
plot(ETHknnFit,main="KNN",family = "STKaiti")
## 美国不同地区的平均房价预测数据集
## 读取数据
house <- read.csv("data/chap5/USA_Housing.csv")
dim(house)
## [1] 5000 6
colnames(house)
## [1] "AvgAreaIncome" "AvgAreaHouseAge"
## [3] "AvgAreaNumberRooms" "AvgAreaNumberofBedrooms"
## [5] "AreaPopulation" "AvgPrice"
## 数据标准化,切分为训练数据和测试数据
set.seed(12)
house[,1:5] <- apply(house[,1:5],2,scale)
index <- createDataPartition(house$AvgPrice,p=0.7)
train_house <- house[index$Resample1,]
test_house <- house[-index$Resample1,]
## KNN 回归
houseknn <- knnreg(AvgPrice~.,train_house,k=5)
housetest_pre <- predict(houseknn,test_house)
errormae <- mae(test_house$AvgPrice,housetest_pre)
sprintf("KNN回归的绝对值误差为%.2f",errormae)
## [1] "KNN回归的绝对值误差为105883.26"
## 分析不同的K值下,房价对测试集的预测误差
ks <- seq(1,30,2)
pricemae <- ks
for(ii in 1:length(ks)){
houseknnii <- knnreg(AvgPrice~.,train_house,k=ks[ii])
housetest_preii <- predict(houseknnii,test_house)
pricemae[ii] <- mae(test_house$AvgPrice,housetest_preii)
}
data.frame(k = ks,error_mae = pricemae)%>%
ggplot(aes(x=k,y=error_mae))+
geom_line()+geom_point(colour="red")+
scale_x_continuous(breaks = ks)+
labs(x="近邻数量",y = "绝对值误差",title = "房价的KNN回归")
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(dplyr)
library(tidytext)
library(tidyr)
library(pheatmap)
library(textreg)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(wordcloud)
## Loading required package: RColorBrewer
library(stringr)
## 读取数据
spam <- read.csv("data/chap8/spam.csv",
stringsAsFactors = F)
## 对没有正确读取的数据进行修正
strjoin <- function(x){
x <- as.vector(x)
text <- ""
for (ii in 1:length(x)){
text <- str_c(text,x[ii],sep = " ")
}
return(text)
}
spam[,2] <- apply(spam[,2:ncol(spam)],1, strjoin)
spam[,3:ncol(spam)] <- NULL
colnames(spam) <- c("label","text")
spam$label <- as.factor(spam$label)
table(spam$label)
##
## ham spam
## 4825 747
##构建语料库,
spam_cp <- Corpus(VectorSource(spam$text))
## 剔除非英文字符
deletnoneEng <- function(s){
gsub(pattern = '[^a-zA-Z0-9\\s]+',
x = s,replacement = "",
ignore.case = TRUE,
perl = TRUE)
}
spam_cp <- tm_map(spam_cp, content_transformer(deletnoneEng))
## Warning in tm_map.SimpleCorpus(spam_cp, content_transformer(deletnoneEng)):
## transformation drops documents
## 去处语料库中的所有数字
spam_cp <- tm_map(spam_cp,removeNumbers)
## Warning in tm_map.SimpleCorpus(spam_cp, removeNumbers): transformation drops
## documents
## 从文本文档中删除标点符号
spam_cp <- tm_map(spam_cp,removePunctuation)
## Warning in tm_map.SimpleCorpus(spam_cp, removePunctuation): transformation drops
## documents
## 将所有的字母均转化为小写
spam_cp_clearn <- tm_map(spam_cp,tolower)
## Warning in tm_map.SimpleCorpus(spam_cp, tolower): transformation drops documents
## 去除停用词
spam_cp <- tm_map(spam_cp,removeWords,stopwords())
## Warning in tm_map.SimpleCorpus(spam_cp, removeWords, stopwords()):
## transformation drops documents
## 去除额外的空格
spam_cp <- tm_map(spam_cp,stripWhitespace)
## Warning in tm_map.SimpleCorpus(spam_cp, stripWhitespace): transformation drops
## documents
## 将文本词干化
spam_cp <- tm_map(spam_cp,stemDocument)
## Warning in tm_map.SimpleCorpus(spam_cp, stemDocument): transformation drops
## documents
## 可视化两种情感文本的词云
spam_pro <- data.frame(text=sapply(spam_cp, identity), stringsAsFactors=F)
spam_pro$label <- spam$label
wordfre <- spam_pro%>%unnest_tokens(word,text)%>%
group_by(label,word)%>%
summarise(Fre = n())%>%
arrange(desc(Fre)) %>%
acast(word~label,value.var = "Fre",fill = 0)
## 可视化两种类型邮件的词云
comparison.cloud(wordfre,scale=c(4,.5),max.words=180,
title.size=1.5,colors = c("gray50","gray10"))
## 找到频繁出现的词语,出现频率大于5
dict <- names(which(wordfre[,1]+wordfre[,2] >5))
## 构建TF矩阵
spam_dtm <- DocumentTermMatrix(spam_cp,control = list(dictionary = dict))
spam_dtm
## <<DocumentTermMatrix (documents: 5572, terms: 1416)>>
## Non-/sparse entries: 36809/7853143
## Sparsity : 100%
## Maximal term length: 19
## Weighting : term frequency (tf)
## 通常情况下,文档-词语的tf-idf矩阵非常的稀疏,可以删除一些不重要的词来缓解矩阵的稀疏性,同时提高计算效率
dim(spam_dtm)
## [1] 5572 1416
spam_dtm <- removeSparseTerms(spam_dtm,0.999)
spam_dtm
## <<DocumentTermMatrix (documents: 5572, terms: 1252)>>
## Non-/sparse entries: 36634/6939510
## Sparsity : 99%
## Maximal term length: 19
## Weighting : term frequency (tf)
## 此时文档-词语的tf-idf矩阵稀疏性得到了缓解,而且词语的数量也减少到了2194个
## 可视化随机抽取100行和100列可视化tf-idf矩阵热力图
set.seed(123)
index <- sample(min(dim(spam_dtm)),100)
pheatmap(as.matrix(spam_dtm)[index,index],cluster_rows = FALSE,
cluster_cols = FALSE,show_rownames = FALSE,
show_colnames = T,main = "TF Matrix Part",
fontsize_col = 5)
save(spam_cp,file = "data/chap8/spam_cp.RData")
save(spam,file = "data/chap8/spam.RData")
library(e1071)
library(naivebayes)
## naivebayes 0.9.6 loaded
library(Metrics)
library(gmodels)
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:wordcloud':
##
## textplot
## The following object is masked from 'package:stats':
##
## lowess
## 数据随机切分为75%训练集和25%测试集
set.seed(123)
index <- sample(nrow(spam),nrow(spam)*0.75)
spam_dtm2mat <- as.matrix(spam_dtm)
train_x <- spam_dtm2mat[index,]
train_y <- spam$label[index]
test_x <- spam_dtm2mat[-index,]
test_y <- spam$label[-index]
## 将每个词项所代表的特征转化为对应单词的因子变量
train_x <- apply(train_x, 2, function(x) as.factor(ifelse(x>0,1,0)))
test_x <- apply(test_x, 2, function(x) as.factor(ifelse(x>0,1,0)))
## 使用e1071包中的naiveBayes建立模型
spamnb <- naiveBayes(x = train_x,y = train_y,laplace = 1)
summary(spamnb)
## Length Class Mode
## apriori 2 table numeric
## tables 1252 -none- list
## levels 2 -none- character
## isnumeric 1252 -none- logical
## call 4 -none- call
## 对测试集进行预测,查看模型的精度
test_pre <- predict(spamnb,test_x,type = "class")
CrossTable(test_y,test_pre,prop.r = F,prop.t = F,prop.chisq = F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1393
##
##
## | test_pre
## test_y | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1203 | 3 | 1206 |
## | 0.985 | 0.017 | |
## -------------|-----------|-----------|-----------|
## spam | 18 | 169 | 187 |
## | 0.015 | 0.983 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1221 | 172 | 1393 |
## | 0.877 | 0.123 | |
## -------------|-----------|-----------|-----------|
##
##
table(test_y,test_pre)
## test_pre
## test_y ham spam
## ham 1203 3
## spam 18 169
sprintf("朴素贝叶斯的识别精度为%.4f",accuracy(test_y,test_pre))
## [1] "朴素贝叶斯的识别精度为0.9849"
pred <- prediction(as.integer(test_pre),as.integer(test_y))
par(pty=c("s"))
plot(performance(pred,"tpr","fpr"),main = "Naive Bayes")
## 使用naivebayes包中的naive_bayes建立模型
# spamnb2 <- naive_bayes(x = train_x,y = train_y,laplace = 1)
# summary(spamnb2)
## 对测试集进行预测,查看模型的精度
# test_pre2 <- predict(spamnb2,test_x,type = "class")
# table(test_pre2,test_y)
# accuracy(test_y,test_pre2)