Use principal component analysis (PCA) to reduce the dimensionality of image data, visualize where the images lie in the low-dimensional space, and visualize the leading principal components of the image sample as feature images ("eigenimages").
## Principal Component Analysis
library(R.matlab)
## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.
##
## Attaching package: 'R.matlab'
## The following objects are masked from 'package:base':
##
## getOption, isOpen
ETHdata <- readMat("data/chap6/ETH_8class_object_8_big_classes_32_32_1024D.mat")
ETHims <- ETHdata$A / 255.0
dim(ETHims)
## [1] 1024 3280
## Visualize a random sample of the images
set.seed(123)
index <- sample(ncol(ETHims),40)
par(mfrow = c(5,8),mai=c(0.05,0.05,0.05,0.05))
for(ii in seq_along(index)){
im <- matrix(ETHims[,index[ii]],nrow=32,ncol = 32,byrow = TRUE)
image(im,col = gray(seq(0, 1, length = 256)),xaxt= "n", yaxt= "n")
}
## Run PCA on the data and visualize the positions of a subset of images
ETHlocal <- t(ETHims)
dim(ETHlocal)
## [1] 3280 1024
ETHpca1 <- princomp(ETHlocal)
## Coordinates of each sample on the first two principal components
local <- ETHpca1$scores[,1:2]
dim(local)
## [1] 3280 2
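## How representative are the first two components? princomp() stores the component
## standard deviations, so the explained-variance proportions follow directly:
pvar <- ETHpca1$sdev^2 / sum(ETHpca1$sdev^2)
cumsum(pvar[1:2])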
## To reduce overplotting, randomly select 1000 images for visualization
set.seed(123)
index <- sample(nrow(local),1000)
localind <- local[index,]
x <- localind[,1]
y <- localind[,2]
ETHimsindex <- ETHims[,index]
## Set the width and height of each thumbnail
width <- 0.015*diff(range(x))
height <- 0.03*diff(range(y))
## Plot the images at their PCA coordinates
plot(x,y, t="n",xlab = "PCA 1",ylab = "PCA 2")
for (ii in seq_along(localind[,1])){
imii <- matrix(ETHimsindex[,ii],nrow=32,ncol = 32,byrow = TRUE)
rasterImage(imii, xleft=x[ii] - 0.5*width,
ybottom= y[ii] - 0.5*height,
xright=x[ii] + 0.5*width,
ytop= y[ii] + 0.5*height, interpolate=FALSE)
}
## Zoomed-in view of a local region
plot(x,y, t="n",xlim = c(-8,0),ylim = c(-2,4),
xlab = "PCA 1",ylab = "PCA 2")
for (ii in seq_along(localind[,1])){
imii <- matrix(ETHimsindex[,ii],nrow=32,ncol = 32,byrow = TRUE)
rasterImage(imii, xleft=x[ii] - 0.5*width,
ybottom= y[ii] - 0.5*height,
xright=x[ii] + 0.5*width,
ytop= y[ii] + 0.5*height, interpolate=FALSE)
}
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
dim(ETHims)
## [1] 1024 3280
## Randomly select 800 images so that the number of variables (800 images) is smaller than the number of observations (1024 pixel values)
set.seed(1234)
index <- sample(ncol(ETHims),800)
## Each class contributes roughly 100 images
table(ETHdata$labels[index])
##
## 1 2 3 4 5 6 7 8
## 102 112 109 85 96 83 108 105
ETHimsample<- ETHims[,index]
dim(ETHimsample)
## [1] 1024 800
## Scree plot with parallel analysis to choose a suitable number of components
parpca <- fa.parallel(ETHimsample,fa = "pc")
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined.
## Chi square is based upon observed residuals.
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined for the null model either.
## The Chi square is thus based upon observed correlations.
## Parallel analysis suggests that the number of factors = NA and the number of components = 30
## Plot the first 50 eigenvalues of the scree curve
pcanum <- 50
plotdata <- data.frame(x = 1:pcanum,pc.values = parpca$pc.values[1:pcanum])
ggplot(plotdata,aes(x = x,y = pc.values))+
theme_bw(base_family = "STKaiti")+
geom_point(colour = "red")+geom_line(colour = "blue")+
labs(x = "主成分个数")
## PCA: extract the first 30 principal components
ETHcor <- cor(ETHimsample)
dim(ETHcor)
## [1] 800 800
ETHpca2 <- principal(ETHcor,nfactors = 30)
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined.
## Chi square is based upon observed residuals.
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined for the null model either.
## The Chi square is thus based upon observed correlations.
## Use the PCA model to obtain the 30 component scores for the data set
ETHpca_im<- predict.psych(ETHpca2,ETHimsample)
## Visualize these components as feature images
par(mfrow = c(5,6),mai=c(0.05,0.05,0.05,0.05))
for(ii in 1:30){
im <- matrix(ETHpca_im[,ii],nrow=32,ncol = 32,byrow = TRUE)
image(im,col = gray(seq(0, 1, length = 128)),xaxt= "n", yaxt= "n")
}
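## As a rough sanity check of the 30-component model, the standardized data can be
## approximated by scores %*% t(loadings) (the usual low-rank factor approximation;
## a sketch, not part of the original analysis). Compare one reconstructed image
## with its standardized original:
recon <- ETHpca_im %*% t(unclass(ETHpca2$loadings)) # 1024 x 800
par(mfrow = c(1,2),mai=c(0.05,0.05,0.05,0.05))
image(matrix(scale(ETHimsample)[,1],nrow=32,byrow = TRUE),
col = gray(seq(0, 1, length = 128)),xaxt= "n", yaxt= "n")
image(matrix(recon[,1],nrow=32,byrow = TRUE),
col = gray(seq(0, 1, length = 128)),xaxt= "n", yaxt= "n")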
library(ggplot2)
library(gridExtra)
library(ggdendro)
library(cluster)
library(ggfortify)
## Hierarchical clustering on the iris data set
iris <- read.csv("data/chap6/Iris.csv")
## Clean up the class labels
iris$Species <- stringr::str_replace(iris$Species,"Iris-","")
iris4 <- iris[,2:5]
str(iris4)
## 'data.frame': 150 obs. of 4 variables:
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
iris_scale <- scale(iris4)
## Hierarchical clustering and visualization
hc1 <- hclust(dist(iris_scale),method = "ward.D2")
hc1$labels <- paste(iris$Species,1:150,sep = "-")
## Plot the dendrogram
par(family = "STKaiti",cex = 0.45)
plot(hc1,hang = -1)
rect.hclust(hc1, k=3, border="red")
ggdendrogram(hc1, segments = TRUE, rotate = FALSE, theme_dendro = FALSE, size = 4)+
theme_bw()+theme(axis.text.x = element_text(size = 5,angle = 90))
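## The dendrogram suggests three clusters; cutree() extracts the memberships,
## which can be cross-tabulated against the true species:
table(cutree(hc1, k = 3), iris$Species)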
## k-means clustering on the iris data set
## Compute the total within-cluster and between-cluster sums of squares for k = 1..15
tot_withinss <- vector()
betweenss <- vector()
for(ii in 1:15){
k1 <- kmeans(iris_scale,ii)
tot_withinss[ii] <- k1$tot.withinss
betweenss[ii] <- k1$betweenss
}
kmeanvalue <- data.frame(kk = 1:15,
tot_withinss = tot_withinss,
betweenss = betweenss)
p1 <- ggplot(kmeanvalue,aes(x = kk,y = tot_withinss))+
theme_bw(base_family = "STKaiti")+
geom_point() + geom_line() +labs(y = "value") +
ggtitle("Total within-cluster sum of squares")+
theme(plot.title = element_text(hjust = 0.5))+
scale_x_continuous("kmean 聚类个数",kmeanvalue$kk)
p2 <- ggplot(kmeanvalue,aes(x = kk,y = betweenss))+
theme_bw(base_family = "STKaiti")+
geom_point() +geom_line() +labs(y = "value") +
ggtitle("The between-cluster sum of squares") +
theme(plot.title = element_text(hjust = 0.5))+
scale_x_continuous("kmean 聚类个数",kmeanvalue$kk)
grid.arrange(p1,p2,nrow=2)
## Three clusters looks like a reasonable choice.
## As the number of clusters grows, the within-cluster sum of squares decreases and the between-cluster sum of squares increases; based on the elbow in these curves and the background of the data, the data can be grouped into three clusters.
set.seed(245)
k3 <- kmeans(iris_scale,3)
summary(k3)
## Length Class Mode
## cluster 150 -none- numeric
## centers 12 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
k3
## K-means clustering with 3 clusters of sizes 47, 50, 53
##
## Cluster means:
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1 1.13217737 0.0962759 0.9929445 1.0137756
## 2 -1.01119138 0.8394944 -1.3005215 -1.2509379
## 3 -0.05005221 -0.8773526 0.3463713 0.2811215
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 3 3 3 1 3 3 3 3 3 3 3 3 1 3 3 3 3 1 3 3 3
## [75] 3 1 1 1 3 3 3 3 3 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1 1 3 1 1 1 1
## [112] 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 1 1 1 1 1 1 3 3 1 1 1 3 1 1 1 3 1 1 1 3 1
## [149] 1 3
##
## Within cluster sum of squares by cluster:
## [1] 47.60995 48.15831 44.25778
## (between_SS / total_SS = 76.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
table(k3$cluster)
##
## 1 2 3
## 47 50 53
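## Cross-tabulating the cluster labels with the true species shows how well
## k-means recovers the three classes:
table(iris$Species, k3$cluster)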
## Visualize the clustering result
clusplot(iris_scale,k3$cluster,main = "k-means clustering, k = 3")
## Silhouette plot to assess the clustering quality
sis1 <- silhouette(k3$cluster,dist(iris_scale,method = "euclidean"))
plot(sis1,main = "Iris kmean silhouette",
col = c("red", "green", "blue"))
## Using the two-moons and circle data sets
library(fpc)
## Density-based clustering with the DBSCAN algorithm
# Read the data
moondata <- read.csv("data/chap6/moonsdatas.csv")
moondata$Y <- as.factor(moondata$Y)
str(moondata)
## 'data.frame': 200 obs. of 3 variables:
## $ X1: num 0.742 1.744 1.693 0.74 -0.378 ...
## $ X2: num 0.5856 0.0391 -0.1906 0.6393 0.9748 ...
## $ Y : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 2 1 ...
## Plot the raw data
ggplot(moondata,aes(x = X1,y = X2,shape = Y))+
theme_bw()+geom_point()
# Density clustering with dbscan() from the fpc package
model1 <- dbscan(moondata[,1:2],eps=0.05,MinPts=5)
## Clustering result
model1
## dbscan Pts=200 MinPts=5 eps=0.05
## 0 1
## border 195 4
## seed 0 1
## total 195 5
table(model1$cluster)
##
## 0 1
## 195 5
## Visualize the clusters (cluster 0 is noise)
moondata$clu <- model1$cluster
ggplot(moondata,aes(x = X1,y = X2,shape = as.factor(clu)))+
theme_bw()+geom_point()+theme(legend.position = c(0.8,0.8))
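## With eps = 0.05 almost every point is labeled noise. A common heuristic for choosing
## eps is the k-nearest-neighbor distance plot: sort the distances to the MinPts-th
## nearest neighbor and look for a "knee". A minimal base-R sketch:
d <- as.matrix(dist(moondata[,1:2]))
knn5 <- apply(d, 1, function(r) sort(r)[6]) # 5th neighbor; sort(r)[1] is the point itself
plot(sort(knn5), type = "l",
xlab = "points sorted by distance", ylab = "distance to 5th nearest neighbor")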
## Compare the clusterings under different eps values
eps <- c(0.05,0.06,0.25,0.3)
name <- c("one","two","three","four")
dbdata <- moondata[,1:2]
for (ii in 1:length(eps)) {
modeli <- dbscan(dbdata[,1:2],eps=eps[ii],MinPts=5)
dbdata[[name[ii]]] <- as.factor(modeli$cluster)
}
head(dbdata)
## X1 X2 one two three four
## 1 0.7424201 0.58556710 0 0 1 1
## 2 1.7444393 0.03909624 0 0 2 1
## 3 1.6934791 -0.19061851 0 0 2 1
## 4 0.7395695 0.63927458 0 0 1 1
## 5 -0.3780247 0.97481407 0 0 1 1
## 6 0.8943966 0.26841801 0 0 1 1
p1<- ggplot(dbdata,aes(x = X1,y = X2,shape = one,colour = one))+
theme_bw(base_size = 8)+geom_point()+
theme(legend.position = c(0.8,0.8))+ggtitle("eps=0.05,MinPts=5")
p2<- ggplot(dbdata,aes(x = X1,y = X2,shape = two,colour = two))+
theme_bw(base_size = 8)+geom_point()+
theme(legend.position = c(0.8,0.8))+ggtitle("eps=0.06,MinPts=5")
p3<- ggplot(dbdata,aes(x = X1,y = X2,shape = three,colour = three))+
theme_bw(base_size = 8)+geom_point()+
theme(legend.position = c(0.8,0.8))+ggtitle("eps=0.2,MinPts=5")
p4<- ggplot(dbdata,aes(x = X1,y = X2,shape = four,colour = four))+
theme_bw(base_size = 8)+geom_point()+
theme(legend.position = c(0.8,0.8))+ggtitle("eps=0.3,MinPts=5")
grid.arrange(p1,p2,p3,p4,nrow = 2)
## Correspondence analysis examines the detailed dependence between two categorical variables
library(ca)
## The smoke data set gives the frequencies of smoking habits (none, light, medium, heavy) for the staff groups of a fictional company (senior managers, junior managers, senior employees, junior employees and secretaries).
data("smoke")
## Chi-squared test of independence between the two variables
(result <- chisq.test(smoke))
## Warning in chisq.test(smoke): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: smoke
## X-squared = 16.442, df = 12, p-value = 0.1718
## p-value = 0.1718 > 0.05, so the null hypothesis of independence cannot be rejected at the 5% level
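## Although the test does not reject independence, the chisq.test object also stores
## the expected counts and Pearson residuals, which show where the observed table
## deviates most from independence:
result$expected
round(result$residuals, 2)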
## Visualize the table with a mosaic plot
par(family = "STKaiti")
mosaicplot(smoke,main = "",color = c("red","blue","green","orange"))
## In the mosaic plot, medium smokers look most common among JM and JE, while SE contains relatively more non-smokers; correspondence analysis can check whether this impression holds
## Correspondence analysis
smca <- ca(smoke)
summary(smca)
##
## Principal inertias (eigenvalues):
##
## dim value % cum% scree plot
## 1 0.074759 87.8 87.8 **********************
## 2 0.010017 11.8 99.5 ***
## 3 0.000414 0.5 100.0
## -------- -----
## Total: 0.085190 100.0
##
##
## Rows:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | SM | 57 893 31 | -66 92 3 | -194 800 214 |
## 2 | JM | 93 991 139 | 259 526 84 | -243 465 551 |
## 3 | SE | 264 1000 450 | -381 999 512 | -11 1 3 |
## 4 | JE | 456 1000 308 | 233 942 331 | 58 58 152 |
## 5 | SC | 130 999 71 | -201 865 70 | 79 133 81 |
##
## Columns:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | none | 316 1000 577 | -393 994 654 | -30 6 29 |
## 2 | lght | 233 984 83 | 99 327 31 | 141 657 463 |
## 3 | medm | 321 983 148 | 196 982 166 | 7 1 2 |
## 4 | hevy | 130 995 192 | 294 684 150 | -198 310 506 |
plot(smca,main = "smoke data")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
## Aggregate data for the six largest graduate departments at UC Berkeley in 1973, cross-classified by admission status and gender.
data("UCBAdmissions")
mca <- mjca(UCBAdmissions)
summary(mca)
##
## Principal inertias (eigenvalues):
##
## dim value % cum% scree plot
## 1 0.114945 80.5 80.5 ************************
## 2 0.005694 4.0 84.5 *
## 3 00000000 0.0 84.5
## 4 00000000 0.0 84.5
## 5 00000000 0.0 84.5
## -------- -----
## Total: 0.142840
##
##
## Columns:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | Admit:Admitted | 129 911 93 | 365 875 150 | 74 36 123 |
## 2 | Admit:Rejected | 204 911 59 | -231 875 95 | -47 36 78 |
## 3 | Gender:Female | 135 863 95 | -399 845 187 | 59 19 84 |
## 4 | Gender:Male | 198 863 65 | 272 845 127 | -40 19 57 |
## 5 | Dept:A | 69 838 117 | 512 837 156 | 13 1 2 |
## 6 | Dept:B | 43 829 124 | 573 824 123 | -45 5 15 |
## 7 | Dept:C | 68 731 108 | -270 594 43 | 130 137 199 |
## 8 | Dept:D | 58 832 106 | -110 828 6 | 7 3 0 |
## 9 | Dept:E | 43 812 117 | -384 787 55 | 69 25 35 |
## 10 | Dept:F | 53 737 116 | -355 547 58 | -210 190 406 |
par(family = "STKaiti")
plot(mca, mass = c(TRUE, TRUE),col = c("black","red","green","blue"),
main = "三维列联表对应分析")
library(ade4)
##
## Attaching package: 'ade4'
## The following object is masked from 'package:FactoMineR':
##
## reconst
library(CCA)
## Loading required package: fda
## Loading required package: splines
## Loading required package: Matrix
##
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
##
## matplot
## Loading required package: fields
## Loading required package: spam
## Loading required package: dotCall64
## Loading required package: grid
## Spam version 2.5-1 (2019-12-12) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following object is masked from 'package:Matrix':
##
## det
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: maps
##
## Attaching package: 'maps'
## The following object is masked from 'package:cluster':
##
## votes.repub
## See https://github.com/NCAR/Fields for
## an extensive vignette, other supplements and source code
##
## Attaching package: 'fields'
## The following object is masked from 'package:ggfortify':
##
## unscale
## The following object is masked from 'package:psych':
##
## describe
library(candisc)
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## Loading required package: heplots
##
## Attaching package: 'candisc'
## The following object is masked from 'package:stats':
##
## cancor
## Analyze the correlation between two sets of variables
## This data set gives the performances of 33 men in the decathlon at the 1988 Olympic Games.
data(olympic)
## olympic$tab is a data frame with 33 rows and 10 columns, the events of the decathlon: 100 meters (100), long jump (long), shot put (poid), high jump (haut), 400 meters (400), 110-meter hurdles (110), discus throw (disq), pole vault (perc), javelin (jave) and 1500 meters (1500).
olytab <- olympic$tab
summary(olytab)
## 100 long poid haut
## Min. :10.62 Min. :6.220 Min. :10.27 Min. :1.790
## 1st Qu.:11.02 1st Qu.:7.000 1st Qu.:13.15 1st Qu.:1.940
## Median :11.18 Median :7.090 Median :14.12 Median :1.970
## Mean :11.20 Mean :7.133 Mean :13.98 Mean :1.983
## 3rd Qu.:11.43 3rd Qu.:7.370 3rd Qu.:14.97 3rd Qu.:2.030
## Max. :11.57 Max. :7.720 Max. :16.60 Max. :2.270
## 400 110 disq perc
## Min. :47.44 Min. :14.18 Min. :34.36 Min. :4.000
## 1st Qu.:48.34 1st Qu.:14.72 1st Qu.:39.08 1st Qu.:4.600
## Median :49.15 Median :15.00 Median :42.32 Median :4.700
## Mean :49.28 Mean :15.05 Mean :42.35 Mean :4.739
## 3rd Qu.:49.98 3rd Qu.:15.38 3rd Qu.:44.80 3rd Qu.:4.900
## Max. :51.28 Max. :16.20 Max. :50.66 Max. :5.700
## jave 1500
## Min. :49.52 Min. :256.6
## 1st Qu.:55.42 1st Qu.:266.4
## Median :59.48 Median :272.1
## Mean :59.44 Mean :276.0
## 3rd Qu.:64.00 3rd Qu.:286.0
## Max. :72.60 Max. :303.2
head(olytab)
## 100 long poid haut 400 110 disq perc jave 1500
## 1 11.25 7.43 15.48 2.27 48.90 15.13 49.28 4.7 61.32 268.95
## 2 10.87 7.45 14.97 1.97 47.71 14.46 44.36 5.1 61.76 273.02
## 3 11.18 7.44 14.20 1.97 48.29 14.81 43.66 5.2 64.16 263.20
## 4 10.62 7.38 15.02 2.03 49.06 14.72 44.80 4.9 64.04 285.11
## 5 11.02 7.43 12.92 1.97 47.44 14.40 41.20 5.2 57.46 256.64
## 6 10.83 7.72 13.58 2.12 48.34 14.18 43.06 4.9 52.18 274.07
## Standardize the data
olytab_s <- as.data.frame(scale(olytab))
## Split the variables into arm-dominated and leg-dominated events
## X (arm): shot put (poid), discus throw (disq), javelin (jave), pole vault (perc)
## Y (leg): 100 m (100), long jump (long), high jump (haut), 400 m (400), 110 m hurdles (110), 1500 m (1500)
xname <- c("poid","disq","jave","perc")
yname <- c("100","long","haut","400","110","1500")
olytab_sX <- olytab_s[,xname]
olytab_sY <- olytab_s[,yname]
## Canonical correlation analysis
olycca <- candisc::cancor(olytab_sX, olytab_sY)
summary(olycca)
##
## Canonical correlation analysis of:
## 4 X variables: poid, disq, jave, perc
## with 6 Y variables: 100, long, haut, 400, 110, 1500
##
## CanR CanRSQ Eigen percent cum scree
## 1 0.5866 0.34411 0.52464 47.87 47.87 ******************************
## 2 0.4852 0.23540 0.30788 28.09 75.96 ******************
## 3 0.3991 0.15927 0.18944 17.28 93.24 ***********
## 4 0.2626 0.06898 0.07409 6.76 100.00 ****
##
## Test of H0: The canonical correlations in the
## current row and all that follow are zero
##
## CanR LR test stat approx F numDF denDF Pr(> F)
## 1 0.58661 0.39254 1.04327 24 81.447 0.4252
## 2 0.48518 0.59848 0.90820 15 66.655 0.5590
## 3 0.39909 0.78273 0.81436 8 50.000 0.5934
## 4 0.26265 0.93102 0.64215 3 26.000 0.5948
##
## Raw canonical coefficients
##
## X variables:
## Xcan1 Xcan2 Xcan3 Xcan4
## poid 0.94660 -0.6088535 1.485529 0.770784
## disq -0.68111 1.4055079 -0.677713 -0.021927
## jave -0.28437 0.0074946 0.076189 -1.216793
## perc 0.73503 0.0662211 -0.836913 -0.253155
##
## Y variables:
## Ycan1 Ycan2 Ycan3 Ycan4
## 100 -0.251053 0.080169 -0.501680 -0.10119
## long 0.089086 0.289806 0.047488 -1.06577
## haut -0.092248 0.418897 -0.317659 -0.25448
## 400 0.254005 -0.469382 1.461976 -0.39689
## 110 -0.929616 0.278657 -0.341989 -0.40049
## 1500 -0.045226 1.202536 -0.294172 0.20546
par(mfrow = c(2,2))
plot(olycca,which = 1)
plot(olycca,which = 2)
plot(olycca,which = 3)
plot(olycca,which = 4)
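## Sanity check: the first canonical correlation should equal the correlation between
## the first pair of canonical scores (a sketch, assuming the scores component of the
## candisc "cancor" object, which holds the X and Y canonical variates):
cor(olycca$scores$X[,1], olycca$scores$Y[,1]) # should be close to 0.5866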
## Visualize the correlation matrices used in the canonical correlation analysis
olymat <- matcor(olytab_sX, olytab_sY)
img.matcor(olymat,type = 2)
library(MASS)
library(klaR)
data("iris")
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## Select 100 samples as the training set; the rest form the test set
set.seed(223)
index <- sample(nrow(iris),100)
iris_train <- iris[index,]
iris_test <- iris[-index,]
table(iris_train$Species)
##
## setosa versicolor virginica
## 33 32 35
table(iris_test$Species)
##
## setosa versicolor virginica
## 17 18 15
## Linear discriminant analysis
irislda <- lda(Species~.,data = iris_train)
irislda
## Call:
## lda(Species ~ ., data = iris_train)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.33 0.32 0.35
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.084848 3.524242 1.496970 0.2545455
## versicolor 5.903125 2.753125 4.196875 1.3125000
## virginica 6.602857 2.951429 5.528571 1.9800000
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sepal.Length 1.217393 0.1726320
## Sepal.Width 1.220191 -2.8811341
## Petal.Length -2.379484 0.1012736
## Petal.Width -3.102694 -1.5076948
##
## Proportion of trace:
## LD1 LD2
## 0.9906 0.0094
## Predict on the test set
irisldapre <- predict(irislda,iris_test)
table(iris_test$Species,irisldapre$class)
##
## setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 16 2
## virginica 0 0 15
## Only two test samples are misclassified
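## The overall accuracy follows directly from the confusion table:
mean(irisldapre$class == iris_test$Species) # 48/50 = 0.96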
plot(irislda,abbrev = 1)
## Use partimat() from the klaR package to visualize the decision regions of the discriminant models
## Linear discriminant analysis
par(family = "STKaiti")
partimat(Species~.,data = iris_train,method="lda",
main = "Linear discriminant analysis")
## Quadratic discriminant analysis
par(family = "STKaiti")
partimat(Species~.,data = iris_train,method="qda",
main = "Quadratic discriminant analysis")
## Fit the quadratic discriminant model
irisqda <- qda(Species~.,data = iris_train)
irisqda
## Call:
## qda(Species ~ ., data = iris_train)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.33 0.32 0.35
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.084848 3.524242 1.496970 0.2545455
## versicolor 5.903125 2.753125 4.196875 1.3125000
## virginica 6.602857 2.951429 5.528571 1.9800000
## Predict on the test set
irisqdapre <- predict(irisqda,iris_test)
table(iris_test$Species,irisqdapre$class)
##
## setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 16 2
## virginica 0 0 15
## The accuracy matches the result of the linear discriminant analysis
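## Quick check of the quadratic model's accuracy:
mean(irisqdapre$class == iris_test$Species) # also 0.96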
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(arules)
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
## Market basket analysis
## Read the data
groupdata <- read_csv("data/chap6/dataset_group.csv",col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X2 = col_character()
## )
summary(groupdata)
## X1 X2
## Min. : 1.0 Length:22343
## 1st Qu.: 292.0 Class :character
## Median : 582.0 Mode :character
## Mean : 576.4
## 3rd Qu.: 863.0
## Max. :1139.0
## The data cover 38 distinct products across 1139 purchase records
unique(groupdata$X2)
## [1] "yogurt" "pork"
## [3] "sandwich bags" "lunch meat"
## [5] "all- purpose" "flour"
## [7] "soda" "butter"
## [9] "vegetables" "beef"
## [11] "aluminum foil" "dinner rolls"
## [13] "shampoo" "mixes"
## [15] "soap" "laundry detergent"
## [17] "ice cream" "toilet paper"
## [19] "hand soap" "waffles"
## [21] "cheeses" "milk"
## [23] "dishwashing liquid/detergent" "individual meals"
## [25] "cereals" "tortillas"
## [27] "spaghetti sauce" "ketchup"
## [29] "sandwich loaves" "poultry"
## [31] "bagels" "eggs"
## [33] "juice" "pasta"
## [35] "paper towels" "coffee/tea"
## [37] "fruits" "sugar"
length(unique(groupdata$X1))
## [1] 1139
## Count how many times each product appears in the data
items_unm <- groupdata %>%
group_by(X2)%>%
summarise(num = n())
ggplot(items_unm,aes(x = reorder(X2,num),y = num)) +
theme_bw(base_family = "STKaiti",base_size = 10) +
geom_bar(stat = "identity",fill = "lightblue") +
labs(x = "商品",y = "商品出现次数") +
coord_flip() +
geom_text(aes(x = reorder(X2,num),y = num + 50,label = num),size = 3)
# Convert the table to a list of baskets
buy_data<- split(x=groupdata$X2,f=as.factor(groupdata$X1))
# Total number of item instances
sum(sapply(buy_data,length))
## [1] 22343
# Remove duplicate items within each basket
buy_data <- lapply(buy_data,unique)
sum(sapply(buy_data,length))
## [1] 16753
## 转化为("transactions")交易数据集
buy_data <- as(buy_data,"transactions")
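## A quick look at the resulting transactions object:
summary(buy_data)
inspect(buy_data[1:3])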
## 1: Visualize the frequent items
## Items appearing with support greater than 0.25
par(family = "STKaiti",cex = 0.7)
itemFrequencyPlot(buy_data,support = 0.25,col = "lightblue",
xlab = "Item",ylab = "Relative frequency",
main = "Items with support > 0.25")
## Visualize the top 20 items
par(family = "STKaiti",cex = 0.75)
itemFrequencyPlot(buy_data,topN = 20,col = "lightblue",
xlab = "Item",ylab = "Relative frequency",
main = "Top 20 items")
## Mine association rules with Apriori
myrule <- apriori(data = buy_data,
parameter = list(support = 0.25,
confidence = 0.4,
minlen = 1))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.25 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 284
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[38 item(s), 1139 transaction(s)] done [0.00s].
## sorting and recoding items ... [38 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [57 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## 57 rules were found
summary(myrule)
## set of 57 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2
## 2 55
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 1.965 2.000 2.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.2704 Min. :0.4002 Min. :1.000 Min. :308.0
## 1st Qu.:0.2915 1st Qu.:0.4216 1st Qu.:1.053 1st Qu.:332.0
## Median :0.3003 Median :0.7728 Median :1.065 Median :342.0
## Mean :0.3107 Mean :0.6631 Mean :1.066 Mean :353.9
## 3rd Qu.:0.3108 3rd Qu.:0.7875 3rd Qu.:1.078 3rd Qu.:354.0
## Max. :0.7392 Max. :0.8378 Max. :1.133 Max. :842.0
##
## mining info:
## data ntransactions support confidence
## buy_data 1139 0.25 0.4
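## Inspect the strongest of the 57 rules, sorted by lift:
inspect(sort(myrule, by = "lift")[1:5])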
## Association analysis 2: constrain which items may appear on the right-hand side of a rule
## Here we mine only rules whose rhs is "ice cream"
myrule2 <- apriori(buy_data, # data set
parameter = list(minlen =3, # minimum rule length
maxlen = 8,# maximum rule length
supp = 0.1, ## support threshold
conf = 0.45, ## confidence threshold
target = "rules"),
## the rhs may only contain "ice cream";
## the lhs keeps the default (any items)
appearance = list(rhs=c("ice cream"),
default="lhs"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.45 0.1 1 none FALSE TRUE 5 0.1 3
## maxlen target ext
## 8 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 113
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[38 item(s), 1139 transaction(s)] done [0.00s].
## sorting and recoding items ... [38 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [10 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(myrule2)
## set of 10 rules
##
## rule length distribution (lhs + rhs):sizes
## 3
## 10
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3 3 3 3 3 3
##
## summary of quality measures:
## support confidence lift count
## Min. :0.1291 Min. :0.4523 Min. :1.135 Min. :147.0
## 1st Qu.:0.1328 1st Qu.:0.4556 1st Qu.:1.143 1st Qu.:151.2
## Median :0.1343 Median :0.4675 Median :1.173 Median :153.0
## Mean :0.1371 Mean :0.4644 Mean :1.165 Mean :156.2
## 3rd Qu.:0.1431 3rd Qu.:0.4700 3rd Qu.:1.179 3rd Qu.:163.0
## Max. :0.1466 Max. :0.4758 Max. :1.194 Max. :167.0
##
## mining info:
## data ntransactions support confidence
## buy_data 1139 0.1 0.45
## Explore the rules in more detail: sort them by lift
myrule2_sortl <- sort(myrule2,by = "lift")
inspect(myrule2_sortl)
## lhs rhs support confidence lift
## [1] {paper towels,vegetables} => {ice cream} 0.1378402 0.4757576 1.193586
## [2] {sandwich loaves,vegetables} => {ice cream} 0.1343284 0.4751553 1.192075
## [3] {lunch meat,vegetables} => {ice cream} 0.1466198 0.4704225 1.180201
## [4] {aluminum foil,vegetables} => {ice cream} 0.1457419 0.4689266 1.176448
## [5] {cheeses,vegetables} => {ice cream} 0.1448639 0.4687500 1.176005
## [6] {pasta,vegetables} => {ice cream} 0.1334504 0.4662577 1.169752
## [7] {fruits,vegetables} => {ice cream} 0.1325724 0.4561934 1.144503
## [8] {soap,vegetables} => {ice cream} 0.1343284 0.4553571 1.142405
## [9] {beef,vegetables} => {ice cream} 0.1325724 0.4548193 1.141055
## [10] {individual meals,vegetables} => {ice cream} 0.1290606 0.4523077 1.134754
## count
## [1] 157
## [2] 153
## [3] 167
## [4] 166
## [5] 165
## [6] 152
## [7] 151
## [8] 153
## [9] 151
## [10] 147
## Visualize the mined rules
plot(myrule2, method="graph")
plot(myrule2, method="grouped")