第8章: K-近邻和朴素贝叶斯

统一设置ggplot2的绘图风格

library(ggplot2)
theme_set(theme_bw(base_family = "STKaiti"))

8.1：KNN算法

对主成分分析降维后的图像数据集进行KNN分类

## 主成分分析对数据进行降维
library(R.matlab)

## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.

## 
## Attaching package: 'R.matlab'

## The following objects are masked from 'package:base':
## 
##     getOption, isOpen

library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

ETHdata <- readMat("data/chap6/ETH_8class_object_8_big_classes_32_32_1024D.mat")

ETHims <- t(ETHdata$A / 255.0)
dim(ETHims)

## [1] 3280 1024

labels <- as.vector(ETHdata$labels)
table(labels)

## labels
##   1   2   3   4   5   6   7   8 
## 410 410 410 410 410 410 410 410

## 可视化碎石图，选择合适的主成分数
parpca <- fa.parallel(ETHims,fa = "pc")

## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined.
## Chi square is based upon observed residuals.

## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined for the null model either.
## The Chi square is thus based upon observed correlations.

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  40

## 可视化碎石图的部分图像
pcanum <- 50
plotdata <- data.frame(x = 1:pcanum,pc.values = parpca$pc.values[1:pcanum])
ggplot(plotdata,aes(x = x,y = pc.values))+
  theme_bw(base_family = "STKaiti")+
  geom_point(colour = "red")+geom_line(colour = "blue")+
  labs(x = "主成分个数")

## 提取前40个主成分
ETHcor <- cor(ETHims,ETHims)
dim(ETHcor)

## [1] 1024 1024

ETHpca2 <- principal(ETHims,nfactors = 40)

## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined.
## Chi square is based upon observed residuals.
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined for the null model either.
## The Chi square is thus based upon observed correlations.

## 使用pca模型获取数据集的40个主成分
ETHpca40<- predict.psych(ETHpca2,ETHims)

使用这40个主成分建立KNN模型

library(caret)

## Loading required package: lattice

library(Metrics)

## 
## Attaching package: 'Metrics'

## The following objects are masked from 'package:caret':
## 
##     precision, recall

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## 使用这40个主成分建立KNN模型

## 切分训练集和测试集
set.seed(12)
index <- createDataPartition(labels,p=0.75)
labels <- as.factor(labels)
train_ETH <- ETHpca40[index$Resample1,]
train_lab <- labels[index$Resample1]
test_ETH <- ETHpca40[-index$Resample1,]
test_lab <- labels[-index$Resample1]

## KNN分类器
ETHknn <- knn3(x=train_ETH,y=train_lab,k=5)
ETHknn

## 5-nearest neighbor model
## Training set outcome distribution:
## 
##   1   2   3   4   5   6   7   8 
## 311 304 302 313 312 303 308 307

test_pre <- predict(ETHknn,test_ETH,type = "class")
## 计算KNN模型的精度
table(test_lab,test_pre)

##         test_pre
## test_lab   1   2   3   4   5   6   7   8
##        1  81   0   0   0   0   0   0  18
##        2   0  94   1   0   6   2   3   0
##        3   0   3  73   1  11  18   0   2
##        4   0   0   0  91   0   0   0   6
##        5   0   1  15   0  64  16   2   0
##        6   0   1  15   0   8  83   0   0
##        7   0   0   0   0   0   0 102   0
##        8   1   0   0   0   0   0   0 102

sprintf("KNN分类精度为%.4f",accuracy(test_lab,test_pre))

## [1] "KNN分类精度为0.8415"

## 使用混淆矩阵热力图可视化哪些预测正确
confum <- confusionMatrix(test_lab,test_pre)
confum

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   1   2   3   4   5   6   7   8
##          1  81   0   0   0   0   0   0  18
##          2   0  94   1   0   6   2   3   0
##          3   0   3  73   1  11  18   0   2
##          4   0   0   0  91   0   0   0   6
##          5   0   1  15   0  64  16   2   0
##          6   0   1  15   0   8  83   0   0
##          7   0   0   0   0   0   0 102   0
##          8   1   0   0   0   0   0   0 102
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8415          
##                  95% CI : (0.8146, 0.8658)
##     No Information Rate : 0.1561          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8187          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5 Class: 6
## Sensitivity           0.98780   0.9495  0.70192   0.9891  0.71910   0.6975
## Specificity           0.97561   0.9834  0.95112   0.9918  0.95349   0.9658
## Pos Pred Value        0.81818   0.8868  0.67593   0.9381  0.65306   0.7757
## Neg Pred Value        0.99861   0.9930  0.95646   0.9986  0.96537   0.9495
## Prevalence            0.10000   0.1207  0.12683   0.1122  0.10854   0.1451
## Detection Rate        0.09878   0.1146  0.08902   0.1110  0.07805   0.1012
## Detection Prevalence  0.12073   0.1293  0.13171   0.1183  0.11951   0.1305
## Balanced Accuracy     0.98171   0.9664  0.82652   0.9904  0.83629   0.8316
##                      Class: 7 Class: 8
## Sensitivity            0.9533   0.7969
## Specificity            1.0000   0.9986
## Pos Pred Value         1.0000   0.9903
## Neg Pred Value         0.9930   0.9637
## Prevalence             0.1305   0.1561
## Detection Rate         0.1244   0.1244
## Detection Prevalence   0.1244   0.1256
## Balanced Accuracy      0.9766   0.8977

confumat <- as.data.frame(confum$table)
confumat[,1:2] <- apply(confumat[,1:2],2,as.integer)

ggplot(confumat,aes(x=Reference,y = Prediction))+
  geom_tile(aes(fill = Freq))+
  geom_text(aes(label = Freq))+
  scale_x_continuous(breaks = c(0:8))+
  scale_y_continuous(breaks = unique(confumat$Prediction),
                     trans = "reverse")+
  scale_fill_gradient2(low="darkblue", high="lightgreen", 
                       guide="colorbar")+
  ggtitle("KNN分类器在测试集结果")

## 参数搜索，找到精度更高的模型
set.seed(123)
## 使用交叉验证,5 fold cv
trcl <- trainControl(method="cv",number = 5) 
trgrid <- expand.grid(k=seq(1,25,2))
ETHknnFit <- train(x=train_ETH,y=train_lab, method = "knn",
                   trControl = trcl,tuneGrid = trgrid)


ETHknnFit

## k-Nearest Neighbors 
## 
## 2460 samples
##   40 predictor
##    8 classes: '1', '2', '3', '4', '5', '6', '7', '8' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 1970, 1967, 1966, 1970, 1967 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    1  0.8711549  0.8527493
##    3  0.8503850  0.8290112
##    5  0.8402421  0.8174145
##    7  0.8300885  0.8058106
##    9  0.8259936  0.8011274
##   11  0.8097374  0.7825469
##   13  0.8028061  0.7746248
##   15  0.7954914  0.7662632
##   17  0.7869448  0.7564965
##   19  0.7812512  0.7499867
##   21  0.7731442  0.7407209
##   23  0.7617454  0.7276935
##   25  0.7564434  0.7216332
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.

## plot 近邻数和精度的关系
plot(ETHknnFit,main="KNN",family = "STKaiti")

KNN 回归

## 美国不同地区的平均房价预测数据集
## 读取数据
house <- read.csv("data/chap5/USA_Housing.csv")
dim(house)

## [1] 5000    6

colnames(house)

## [1] "AvgAreaIncome"           "AvgAreaHouseAge"        
## [3] "AvgAreaNumberRooms"      "AvgAreaNumberofBedrooms"
## [5] "AreaPopulation"          "AvgPrice"

## 数据标准化，切分为训练数据和测试数据
set.seed(12)
house[,1:5] <- apply(house[,1:5],2,scale)
index <- createDataPartition(house$AvgPrice,p=0.7)
train_house <- house[index$Resample1,]
test_house <- house[-index$Resample1,]
## KNN 回归
houseknn <- knnreg(AvgPrice~.,train_house,k=5)
housetest_pre <- predict(houseknn,test_house)
errormae <- mae(test_house$AvgPrice,housetest_pre)
sprintf("KNN回归的绝对值误差为%.2f",errormae)

## [1] "KNN回归的绝对值误差为105883.26"

## 分析不同的K值下，房价对测试集的预测误差
ks <- seq(1,30,2)
pricemae <- ks
for(ii in 1:length(ks)){
  houseknnii <- knnreg(AvgPrice~.,train_house,k=ks[ii])
  housetest_preii <- predict(houseknnii,test_house)
  pricemae[ii] <- mae(test_house$AvgPrice,housetest_preii)
}
data.frame(k = ks,error_mae = pricemae)%>%
  ggplot(aes(x=k,y=error_mae))+
  geom_line()+geom_point(colour="red")+
  scale_x_continuous(breaks = ks)+
  labs(x="近邻数量",y = "绝对值误差",title = "房价的KNN回归")

8.2：朴素贝叶斯方法

针对垃圾邮件数据，进行识别

library(tm)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(dplyr)
library(tidytext)
library(tidyr)
library(pheatmap)
library(textreg)
library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

library(wordcloud)

## Loading required package: RColorBrewer

library(stringr)
## 读取数据
spam <- read.csv("data/chap8/spam.csv",
                 stringsAsFactors = F)

## 对没有正确读取的数据进行修正
strjoin <- function(x){
  x <- as.vector(x)
  text <- ""
  for (ii in 1:length(x)){
    text <- str_c(text,x[ii],sep = " ")
  }
  return(text)
}
spam[,2] <- apply(spam[,2:ncol(spam)],1, strjoin)
spam[,3:ncol(spam)] <- NULL
colnames(spam) <- c("label","text")
spam$label <- as.factor(spam$label)
table(spam$label)

## 
##  ham spam 
## 4825  747

##构建语料库,
spam_cp <- Corpus(VectorSource(spam$text))
## 剔除非英文字符
deletnoneEng <- function(s){
  gsub(pattern = '[^a-zA-Z0-9\\s]+',
       x = s,replacement = "",
       ignore.case = TRUE,
       perl = TRUE)
  }
spam_cp <- tm_map(spam_cp, content_transformer(deletnoneEng))

## Warning in tm_map.SimpleCorpus(spam_cp, content_transformer(deletnoneEng)):
## transformation drops documents

## 去处语料库中的所有数字
spam_cp <- tm_map(spam_cp,removeNumbers)

## Warning in tm_map.SimpleCorpus(spam_cp, removeNumbers): transformation drops
## documents

## 从文本文档中删除标点符号
spam_cp <- tm_map(spam_cp,removePunctuation)

## Warning in tm_map.SimpleCorpus(spam_cp, removePunctuation): transformation drops
## documents

## 将所有的字母均转化为小写
spam_cp_clearn <- tm_map(spam_cp,tolower)

## Warning in tm_map.SimpleCorpus(spam_cp, tolower): transformation drops documents

## 去除停用词
spam_cp <- tm_map(spam_cp,removeWords,stopwords())

## Warning in tm_map.SimpleCorpus(spam_cp, removeWords, stopwords()):
## transformation drops documents

## 去除额外的空格
spam_cp <- tm_map(spam_cp,stripWhitespace)

## Warning in tm_map.SimpleCorpus(spam_cp, stripWhitespace): transformation drops
## documents

## 将文本词干化
spam_cp <- tm_map(spam_cp,stemDocument)

## Warning in tm_map.SimpleCorpus(spam_cp, stemDocument): transformation drops
## documents

## 可视化两种情感文本的词云
spam_pro <- data.frame(text=sapply(spam_cp, identity), stringsAsFactors=F)
spam_pro$label <- spam$label
wordfre <- spam_pro%>%unnest_tokens(word,text)%>%
  group_by(label,word)%>%
  summarise(Fre = n())%>%
  arrange(desc(Fre)) %>%
  acast(word~label,value.var = "Fre",fill = 0)
  
## 可视化两种类型邮件的词云
comparison.cloud(wordfre,scale=c(4,.5),max.words=180,
                 title.size=1.5,colors = c("gray50","gray10"))

## 找到频繁出现的词语,出现频率大于5
dict <- names(which(wordfre[,1]+wordfre[,2] >5))

## 构建TF矩阵
spam_dtm <- DocumentTermMatrix(spam_cp,control = list(dictionary = dict))
spam_dtm

## <<DocumentTermMatrix (documents: 5572, terms: 1416)>>
## Non-/sparse entries: 36809/7853143
## Sparsity           : 100%
## Maximal term length: 19
## Weighting          : term frequency (tf)

## 通常情况下，文档－词语的tf-idf矩阵非常的稀疏，可以删除一些不重要的词来缓解矩阵的稀疏性，同时提高计算效率
dim(spam_dtm)

## [1] 5572 1416

spam_dtm <- removeSparseTerms(spam_dtm,0.999)
spam_dtm

## <<DocumentTermMatrix (documents: 5572, terms: 1252)>>
## Non-/sparse entries: 36634/6939510
## Sparsity           : 99%
## Maximal term length: 19
## Weighting          : term frequency (tf)

## 此时文档－词语的tf-idf矩阵稀疏性得到了缓解，而且词语的数量也减少到了2194个
## 可视化随机抽取100行和100列可视化tf-idf矩阵热力图
set.seed(123)
index <- sample(min(dim(spam_dtm)),100)
pheatmap(as.matrix(spam_dtm)[index,index],cluster_rows = FALSE,
         cluster_cols = FALSE,show_rownames = FALSE, 
         show_colnames = T,main = "TF Matrix Part",
         fontsize_col = 5)

save(spam_cp,file = "data/chap8/spam_cp.RData")
save(spam,file = "data/chap8/spam.RData")

贝叶斯分类器

library(e1071)
library(naivebayes)

## naivebayes 0.9.6 loaded

library(Metrics)
library(gmodels)
library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:wordcloud':
## 
##     textplot

## The following object is masked from 'package:stats':
## 
##     lowess

## 数据随机切分为75%训练集和25%测试集
set.seed(123)
index <- sample(nrow(spam),nrow(spam)*0.75)
spam_dtm2mat <- as.matrix(spam_dtm)
train_x <- spam_dtm2mat[index,]
train_y <- spam$label[index]
test_x <- spam_dtm2mat[-index,]
test_y <- spam$label[-index]

## 将每个词项所代表的特征转化为对应单词的因子变量
train_x <- apply(train_x, 2, function(x) as.factor(ifelse(x>0,1,0)))
test_x <- apply(test_x, 2, function(x) as.factor(ifelse(x>0,1,0)))
## 使用e1071包中的naiveBayes建立模型
spamnb <- naiveBayes(x = train_x,y = train_y,laplace = 1)
summary(spamnb)

##           Length Class  Mode     
## apriori      2   table  numeric  
## tables    1252   -none- list     
## levels       2   -none- character
## isnumeric 1252   -none- logical  
## call         4   -none- call

## 对测试集进行预测，查看模型的精度
test_pre <- predict(spamnb,test_x,type = "class")

CrossTable(test_y,test_pre,prop.r = F,prop.t = F,prop.chisq = F)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  1393 
## 
##  
##              | test_pre 
##       test_y |       ham |      spam | Row Total | 
## -------------|-----------|-----------|-----------|
##          ham |      1203 |         3 |      1206 | 
##              |     0.985 |     0.017 |           | 
## -------------|-----------|-----------|-----------|
##         spam |        18 |       169 |       187 | 
##              |     0.015 |     0.983 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |      1221 |       172 |      1393 | 
##              |     0.877 |     0.123 |           | 
## -------------|-----------|-----------|-----------|
## 
##

table(test_y,test_pre)

##       test_pre
## test_y  ham spam
##   ham  1203    3
##   spam   18  169

sprintf("朴素贝叶斯的识别精度为%.4f",accuracy(test_y,test_pre))

## [1] "朴素贝叶斯的识别精度为0.9849"

pred <- prediction(as.integer(test_pre),as.integer(test_y))
par(pty=c("s"))
plot(performance(pred,"tpr","fpr"),main = "Naive Bayes")

## 使用naivebayes包中的naive_bayes建立模型
# spamnb2 <- naive_bayes(x = train_x,y = train_y,laplace = 1)
# summary(spamnb2)
## 对测试集进行预测，查看模型的精度
# test_pre2 <- predict(spamnb2,test_x,type = "class")
# table(test_pre2,test_y)
# accuracy(test_y,test_pre2)