Use principal component analysis (PCA) to reduce the dimensionality of image data, visualize where the images lie in the low-dimensional space, and visualize the leading principal components of the image sample as feature images ("eigenimages").
## Principal Component Analysis
library(R.matlab)
## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.
##
## Attaching package: 'R.matlab'
## The following objects are masked from 'package:base':
##
## getOption, isOpen
ETHdata <- readMat("data/chap6/ETH_8class_object_8_big_classes_32_32_1024D.mat")
ETHims <- ETHdata$A / 255.0
dim(ETHims)
## [1] 1024 3280
## Visualize a random sample of the images
set.seed(123)
index <- sample(ncol(ETHims),40)
par(mfrow = c(5,8),mai=c(0.05,0.05,0.05,0.05))
for(ii in seq_along(index)){
im <- matrix(ETHims[,index[ii]],nrow=32,ncol = 32,byrow = TRUE)
image(im,col = gray(seq(0, 1, length = 256)),xaxt= "n", yaxt= "n")
}
## Run PCA on the data and visualize the positions of a subset of images
ETHlocal <- t(ETHims)
dim(ETHlocal)
## [1] 3280 1024
ETHpca1 <- princomp(ETHlocal)
## Coordinates of each sample on the first two principal components
local <- ETHpca1$scores[,1:2]
dim(local)
## [1] 3280 2
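## How representative are the first two components? princomp() stores the component
## standard deviations, so the explained-variance proportions follow directly:
pvar <- ETHpca1$sdev^2 / sum(ETHpca1$sdev^2)
cumsum(pvar[1:2])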
## To reduce overplotting, randomly select 1000 images for visualization
set.seed(123)
index <- sample(nrow(local),1000)
localind <- local[index,]
x <- localind[,1]
y <- localind[,2]
ETHimsindex <- ETHims[,index]
## Set the width and height of each thumbnail
width <- 0.015*diff(range(x))
height <- 0.03*diff(range(y))
## Plot the images at their PCA coordinates
plot(x,y, t="n",xlab = "PCA 1",ylab = "PCA 2")
for (ii in seq_along(localind[,1])){
imii <- matrix(ETHimsindex[,ii],nrow=32,ncol = 32,byrow = TRUE)
rasterImage(imii, xleft=x[ii] - 0.5*width,
ybottom= y[ii] - 0.5*height,
xright=x[ii] + 0.5*width,
ytop= y[ii] + 0.5*height, interpolate=FALSE)
}
## Zoomed-in view of a local region
plot(x,y, t="n",xlim = c(-8,0),ylim = c(-2,4),
xlab = "PCA 1",ylab = "PCA 2")
for (ii in seq_along(localind[,1])){
imii <- matrix(ETHimsindex[,ii],nrow=32,ncol = 32,byrow = TRUE)
rasterImage(imii, xleft=x[ii] - 0.5*width,
ybottom= y[ii] - 0.5*height,
xright=x[ii] + 0.5*width,
ytop= y[ii] + 0.5*height, interpolate=FALSE)
}
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
dim(ETHims)
## [1] 1024 3280
## Randomly select 800 images so that the number of variables (800 images) is smaller than the number of observations (1024 pixel values)
set.seed(1234)
index <- sample(ncol(ETHims),800)
## Each class contributes roughly 100 images
table(ETHdata$labels[index])
##
## 1 2 3 4 5 6 7 8
## 102 112 109 85 96 83 108 105
ETHimsample<- ETHims[,index]
dim(ETHimsample)
## [1] 1024 800
## Scree plot with parallel analysis to choose a suitable number of components
parpca <- fa.parallel(ETHimsample,fa = "pc")
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined.
## Chi square is based upon observed residuals.
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined for the null model either.
## The Chi square is thus based upon observed correlations.
## Parallel analysis suggests that the number of factors = NA and the number of components = 30
## Plot the first 50 eigenvalues of the scree curve
pcanum <- 50
plotdata <- data.frame(x = 1:pcanum,pc.values = parpca$pc.values[1:pcanum])
ggplot(plotdata,aes(x = x,y = pc.values))+
theme_bw(base_family = "STKaiti")+
geom_point(colour = "red")+geom_line(colour = "blue")+
labs(x = "主成分个数")
## PCA: extract the first 30 principal components
ETHcor <- cor(ETHimsample)
dim(ETHcor)
## [1] 800 800
ETHpca2 <- principal(ETHcor,nfactors = 30)
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined.
## Chi square is based upon observed residuals.
## The determinant of the smoothed correlation was zero.
## This means the objective function is not defined for the null model either.
## The Chi square is thus based upon observed correlations.
## Use the PCA model to obtain the 30 component scores for the data set
ETHpca_im<- predict.psych(ETHpca2,ETHimsample)
## Visualize these components as feature images
par(mfrow = c(5,6),mai=c(0.05,0.05,0.05,0.05))
for(ii in 1:30){
im <- matrix(ETHpca_im[,ii],nrow=32,ncol = 32,byrow = TRUE)
image(im,col = gray(seq(0, 1, length = 128)),xaxt= "n", yaxt= "n")
}
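## As a rough sanity check of the 30-component model, the standardized data can be
## approximated by scores %*% t(loadings) (the usual low-rank factor approximation;
## a sketch, not part of the original analysis). Compare one reconstructed image
## with its standardized original:
recon <- ETHpca_im %*% t(unclass(ETHpca2$loadings)) # 1024 x 800
par(mfrow = c(1,2),mai=c(0.05,0.05,0.05,0.05))
image(matrix(scale(ETHimsample)[,1],nrow=32,byrow = TRUE),
col = gray(seq(0, 1, length = 128)),xaxt= "n", yaxt= "n")
image(matrix(recon[,1],nrow=32,byrow = TRUE),
col = gray(seq(0, 1, length = 128)),xaxt= "n", yaxt= "n")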
library(ggplot2)
library(gridExtra)
library(ggdendro)
library(cluster)
library(ggfortify)
## Hierarchical clustering on the iris data set
iris <- read.csv("data/chap6/Iris.csv")
## Clean up the class labels
iris$Species <- stringr::str_replace(iris$Species,"Iris-","")
iris4 <- iris[,2:5]
str(iris4)
## 'data.frame': 150 obs. of 4 variables:
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
iris_scale <- scale(iris4)
## Hierarchical clustering and visualization
hc1 <- hclust(dist(iris_scale),method = "ward.D2")
hc1$labels <- paste(iris$Species,1:150,sep = "-")
## Plot the dendrogram
par(family = "STKaiti",cex = 0.45)
plot(hc1,hang = -1)
rect.hclust(hc1, k=3, border="red")
ggdendrogram(hc1, segments = TRUE, rotate = FALSE, theme_dendro = FALSE, size = 4)+
theme_bw()+theme(axis.text.x = element_text(size = 5,angle = 90))
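## The dendrogram suggests three clusters; cutree() extracts the memberships,
## which can be cross-tabulated against the true species:
table(cutree(hc1, k = 3), iris$Species)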
## k-means clustering on the iris data set
## Compute the total within-cluster and between-cluster sums of squares for k = 1..15
tot_withinss <- vector()
betweenss <- vector()
for(ii in 1:15){
k1 <- kmeans(iris_scale,ii)
tot_withinss[ii] <- k1$tot.withinss
betweenss[ii] <- k1$betweenss
}
kmeanvalue <- data.frame(kk = 1:15,
tot_withinss = tot_withinss,
betweenss = betweenss)
p1 <- ggplot(kmeanvalue,aes(x = kk,y = tot_withinss))+
theme_bw(base_family = "STKaiti")+
geom_point() + geom_line() +labs(y = "value") +
ggtitle("Total within-cluster sum of squares")+
theme(plot.title = element_text(hjust = 0.5))+
scale_x_continuous("kmean 聚类个数",kmeanvalue$kk)
p2 <- ggplot(kmeanvalue,aes(x = kk,y = betweenss))+
theme_bw(base_family = "STKaiti")+
geom_point() +geom_line() +labs(y = "value") +
ggtitle("The between-cluster sum of squares") +
theme(plot.title = element_text(hjust = 0.5))+
scale_x_continuous("kmean 聚类个数",kmeanvalue$kk)
grid.arrange(p1,p2,nrow=2)
## Three clusters looks like a reasonable choice.
## As the number of clusters grows, the within-cluster sum of squares decreases and the between-cluster sum of squares increases; based on the elbow in these curves and the background of the data, the data can be grouped into three clusters.
set.seed(245)
k3 <- kmeans(iris_scale,3)
summary(k3)
## Length Class Mode
## cluster 150 -none- numeric
## centers 12 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
k3
## K-means clustering with 3 clusters of sizes 47, 50, 53
##
## Cluster means:
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1 1.13217737 0.0962759 0.9929445 1.0137756
## 2 -1.01119138 0.8394944 -1.3005215 -1.2509379
## 3 -0.05005221 -0.8773526 0.3463713 0.2811215
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 3 3 3 1 3 3 3 3 3 3 3 3 1 3 3 3 3 1 3 3 3
## [75] 3 1 1 1 3 3 3 3 3 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1 1 3 1 1 1 1
## [112] 1 1 3 3 1 1 1 1 3 1 3 1 3 1 1 3 1 1 1 1 1 1 3 3 1 1 1 3 1 1 1 3 1 1 1 3 1
## [149] 1 3
##
## Within cluster sum of squares by cluster:
## [1] 47.60995 48.15831 44.25778
## (between_SS / total_SS = 76.5 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
table(k3$cluster)
##
## 1 2 3
## 47 50 53
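## Cross-tabulating the cluster labels with the true species shows how well
## k-means recovers the three classes:
table(iris$Species, k3$cluster)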
## Visualize the clustering result
clusplot(iris_scale,k3$cluster,main = "k-means clustering, k = 3")
## Silhouette plot to assess the clustering quality
sis1 <- silhouette(k3$cluster,dist(iris_scale,method = "euclidean"))
plot(sis1,main = "Iris kmean silhouette",
col = c("red", "green", "blue"))
## Using the two-moons and circle data sets
library(fpc)
## Density-based clustering with the DBSCAN algorithm
# Read the data
moondata <- read.csv("data/chap6/moonsdatas.csv")
moondata$Y <- as.factor(moondata$Y)
str(moondata)
## 'data.frame': 200 obs. of 3 variables:
## $ X1: num 0.742 1.744 1.693 0.74 -0.378 ...
## $ X2: num 0.5856 0.0391 -0.1906 0.6393 0.9748 ...
## $ Y : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 2 1 ...
## Plot the raw data
ggplot(moondata,aes(x = X1,y = X2,shape = Y))+
theme_bw()+geom_point()
# Density clustering with dbscan() from the fpc package
model1 <- dbscan(moondata[,1:2],eps=0.05,MinPts=5)
## Clustering result
model1
## dbscan Pts=200 MinPts=5 eps=0.05
## 0 1
## border 195 4
## seed 0 1
## total 195 5
table(model1$cluster)
##
## 0 1
## 195 5
## Visualize the clusters (cluster 0 is noise)
moondata$clu <- model1$cluster
ggplot(moondata,aes(x = X1,y = X2,shape = as.factor(clu)))+
theme_bw()+geom_point()+theme(legend.position = c(0.8,0.8))
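## With eps = 0.05 almost every point is labeled noise. A common heuristic for choosing
## eps is the k-nearest-neighbor distance plot: sort the distances to the MinPts-th
## nearest neighbor and look for a "knee". A minimal base-R sketch:
d <- as.matrix(dist(moondata[,1:2]))
knn5 <- apply(d, 1, function(r) sort(r)[6]) # 5th neighbor; sort(r)[1] is the point itself
plot(sort(knn5), type = "l",
xlab = "points sorted by distance", ylab = "distance to 5th nearest neighbor")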
## Compare the clusterings under different eps values
eps <- c(0.05,0.06,0.25,0.3)
name <- c("one","two","three","four")
dbdata <- moondata[,1:2]
for (ii in 1:length(eps)) {
modeli <- dbscan(dbdata[,1:2],eps=eps[ii],MinPts=5)
dbdata[[name[ii]]] <- as.factor(modeli$cluster)
}
head(dbdata)
## X1 X2 one two three four
## 1 0.7424201 0.58556710 0 0 1 1
## 2 1.7444393 0.03909624 0 0 2 1
## 3 1.6934791 -0.19061851 0 0 2 1
## 4 0.7395695 0.63927458 0 0 1 1
## 5 -0.3780247 0.97481407 0 0 1 1
## 6 0.8943966 0.26841801 0 0 1 1
p1<- ggplot(dbdata,aes(x = X1,y = X2,shape = one,colour = one))+
theme_bw(base_size = 8)+geom_point()+
theme(legend.position = c(0.8,0.8))+ggtitle("eps=0.05,MinPts=5")
p2<- ggplot(dbdata,aes(x = X1,y = X2,shape = two,colour = two))+
theme_bw(base_size = 8)+geom_point()+
theme(legend.position = c(0.8,0.8))+ggtitle("eps=0.06,MinPts=5")
p3<- ggplot(dbdata,aes(x = X1,y = X2,shape = three,colour = three))+
theme_bw(base_size = 8)+geom_point()+
theme(legend.position = c(0.8,0.8))+ggtitle("eps=0.2,MinPts=5")
p4<- ggplot(dbdata,aes(x = X1,y = X2,shape = four,colour = four))+
theme_bw(base_size = 8)+geom_point()+
theme(legend.position = c(0.8,0.8))+ggtitle("eps=0.3,MinPts=5")
grid.arrange(p1,p2,p3,p4,nrow = 2)
## Correspondence analysis examines the detailed dependence between two categorical variables
library(ca)
## The smoke data set gives the frequencies of smoking habits (none, light, medium, heavy) for the staff groups of a fictional company (senior managers, junior managers, senior employees, junior employees and secretaries).
data("smoke")
## Chi-squared test of independence between the two variables
(result <- chisq.test(smoke))
## Warning in chisq.test(smoke): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: smoke
## X-squared = 16.442, df = 12, p-value = 0.1718
## p-value = 0.1718 > 0.05, so the null hypothesis of independence cannot be rejected at the 5% level
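## Although the test does not reject independence, the chisq.test object also stores
## the expected counts and Pearson residuals, which show where the observed table
## deviates most from independence:
result$expected
round(result$residuals, 2)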
## Visualize the table with a mosaic plot
par(family = "STKaiti")
mosaicplot(smoke,main = "",color = c("red","blue","green","orange"))
## In the mosaic plot, medium smokers look most common among JM and JE, while SE contains relatively more non-smokers; correspondence analysis can check whether this impression holds
## Correspondence analysis
smca <- ca(smoke)
summary(smca)
##
## Principal inertias (eigenvalues):
##
## dim value % cum% scree plot
## 1 0.074759 87.8 87.8 **********************
## 2 0.010017 11.8 99.5 ***
## 3 0.000414 0.5 100.0
## -------- -----
## Total: 0.085190 100.0
##
##
## Rows:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | SM | 57 893 31 | -66 92 3 | -194 800 214 |
## 2 | JM | 93 991 139 | 259 526 84 | -243 465 551 |
## 3 | SE | 264 1000 450 | -381 999 512 | -11 1 3 |
## 4 | JE | 456 1000 308 | 233 942 331 | 58 58 152 |
## 5 | SC | 130 999 71 | -201 865 70 | 79 133 81 |
##
## Columns:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | none | 316 1000 577 | -393 994 654 | -30 6 29 |
## 2 | lght | 233 984 83 | 99 327 31 | 141 657 463 |
## 3 | medm | 321 983 148 | 196 982 166 | 7 1 2 |
## 4 | hevy | 130 995 192 | 294 684 150 | -198 310 506 |
plot(smca,main = "smoke data")
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
## Aggregate data for the six largest graduate departments at UC Berkeley in 1973, cross-classified by admission status and gender.
data("UCBAdmissions")
mca <- mjca(UCBAdmissions)
summary(mca)
##
## Principal inertias (eigenvalues):
##
## dim value % cum% scree plot
## 1 0.114945 80.5 80.5 ************************
## 2 0.005694 4.0 84.5 *
## 3 00000000 0.0 84.5
## 4 00000000 0.0 84.5
## 5 00000000 0.0 84.5
## -------- -----
## Total: 0.142840
##
##
## Columns:
## name mass qlt inr k=1 cor ctr k=2 cor ctr
## 1 | Admit:Admitted | 129 911 93 | 365 875 150 | 74 36 123 |
## 2 | Admit:Rejected | 204 911 59 | -231 875 95 | -47 36 78 |
## 3 | Gender:Female | 135 863 95 | -399 845 187 | 59 19 84 |
## 4 | Gender:Male | 198 863 65 | 272 845 127 | -40 19 57 |
## 5 | Dept:A | 69 838 117 | 512 837 156 | 13 1 2 |
## 6 | Dept:B | 43 829 124 | 573 824 123 | -45 5 15 |
## 7 | Dept:C | 68 731 108 | -270 594 43 | 130 137 199 |
## 8 | Dept:D | 58 832 106 | -110 828 6 | 7 3 0 |
## 9 | Dept:E | 43 812 117 | -384 787 55 | 69 25 35 |
## 10 | Dept:F | 53 737 116 | -355 547 58 | -210 190 406 |
par(family = "STKaiti")
plot(mca, mass = c(TRUE, TRUE),col = c("black","red","green","blue"),
main = "三维列联表对应分析")
library(ade4)
##
## Attaching package: 'ade4'
## The following object is masked from 'package:FactoMineR':
##
## reconst
library(CCA)
## Loading required package: fda
## Loading required package: splines
## Loading required package: Matrix
##
## Attaching package: 'fda'
## The following object is masked from 'package:graphics':
##
## matplot
## Loading required package: fields
## Loading required package: spam
## Loading required package: dotCall64
## Loading required package: grid
## Spam version 2.5-1 (2019-12-12) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
##
## Attaching package: 'spam'
## The following object is masked from 'package:Matrix':
##
## det
## The following objects are masked from 'package:base':
##
## backsolve, forwardsolve
## Loading required package: maps
##
## Attaching package: 'maps'
## The following object is masked from 'package:cluster':
##
## votes.repub
## See https://github.com/NCAR/Fields for
## an extensive vignette, other supplements and source code
##
## Attaching package: 'fields'
## The following object is masked from 'package:ggfortify':
##
## unscale
## The following object is masked from 'package:psych':
##
## describe
library(candisc)
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## Loading required package: heplots
##
## Attaching package: 'candisc'
## The following object is masked from 'package:stats':
##
## cancor
## Analyze the correlation between two sets of variables
## This data set gives the performances of 33 men in the decathlon at the 1988 Olympic Games.
data(olympic)
## olympic$tab is a data frame with 33 rows and 10 columns, the events of the decathlon: 100 meters (100), long jump (long), shot put (poid), high jump (haut), 400 meters (400), 110-meter hurdles (110), discus throw (disq), pole vault (perc), javelin (jave) and 1500 meters (1500).
olytab <- olympic$tab
summary(olytab)
## 100 long poid haut
## Min. :10.62 Min. :6.220 Min. :10.27 Min. :1.790
## 1st Qu.:11.02 1st Qu.:7.000 1st Qu.:13.15 1st Qu.:1.940
## Median :11.18 Median :7.090 Median :14.12 Median :1.970
## Mean :11.20 Mean :7.133 Mean :13.98 Mean :1.983
## 3rd Qu.:11.43 3rd Qu.:7.370 3rd Qu.:14.97 3rd Qu.:2.030
## Max. :11.57 Max. :7.720 Max. :16.60 Max. :2.270
## 400 110 disq perc
## Min. :47.44 Min. :14.18 Min. :34.36 Min. :4.000
## 1st Qu.:48.34 1st Qu.:14.72 1st Qu.:39.08 1st Qu.:4.600
## Median :49.15 Median :15.00 Median :42.32 Median :4.700
## Mean :49.28 Mean :15.05 Mean :42.35 Mean :4.739
## 3rd Qu.:49.98 3rd Qu.:15.38 3rd Qu.:44.80 3rd Qu.:4.900
## Max. :51.28 Max. :16.20 Max. :50.66 Max. :5.700
## jave 1500
## Min. :49.52 Min. :256.6
## 1st Qu.:55.42 1st Qu.:266.4
## Median :59.48 Median :272.1
## Mean :59.44 Mean :276.0
## 3rd Qu.:64.00 3rd Qu.:286.0
## Max. :72.60 Max. :303.2
head(olytab)
## 100 long poid haut 400 110 disq perc jave 1500
## 1 11.25 7.43 15.48 2.27 48.90 15.13 49.28 4.7 61.32 268.95
## 2 10.87 7.45 14.97 1.97 47.71 14.46 44.36 5.1 61.76 273.02
## 3 11.18 7.44 14.20 1.97 48.29 14.81 43.66 5.2 64.16 263.20
## 4 10.62 7.38 15.02 2.03 49.06 14.72 44.80 4.9 64.04 285.11
## 5 11.02 7.43 12.92 1.97 47.44 14.40 41.20 5.2 57.46 256.64
## 6 10.83 7.72 13.58 2.12 48.34 14.18 43.06 4.9 52.18 274.07
## Standardize the data
olytab_s <- as.data.frame(scale(olytab))
## Split the variables into arm-dominated and leg-dominated events
## X (arm): shot put (poid), discus throw (disq), javelin (jave), pole vault (perc)
## Y (leg): 100 m (100), long jump (long), high jump (haut), 400 m (400), 110 m hurdles (110), 1500 m (1500)
xname <- c("poid","disq","jave","perc")
yname <- c("100","long","haut","400","110","1500")
olytab_sX <- olytab_s[,xname]
olytab_sY <- olytab_s[,yname]
## Canonical correlation analysis
olycca <- candisc::cancor(olytab_sX, olytab_sY)
summary(olycca)
##
## Canonical correlation analysis of:
## 4 X variables: poid, disq, jave, perc
## with 6 Y variables: 100, long, haut, 400, 110, 1500
##
## CanR CanRSQ Eigen percent cum scree
## 1 0.5866 0.34411 0.52464 47.87 47.87 ******************************
## 2 0.4852 0.23540 0.30788 28.09 75.96 ******************
## 3 0.3991 0.15927 0.18944 17.28 93.24 ***********
## 4 0.2626 0.06898 0.07409 6.76 100.00 ****
##
## Test of H0: The canonical correlations in the
## current row and all that follow are zero
##
## CanR LR test stat approx F numDF denDF Pr(> F)
## 1 0.58661 0.39254 1.04327 24 81.447 0.4252
## 2 0.48518 0.59848 0.90820 15 66.655 0.5590
## 3 0.39909 0.78273 0.81436 8 50.000 0.5934
## 4 0.26265 0.93102 0.64215 3 26.000 0.5948
##
## Raw canonical coefficients
##
## X variables:
## Xcan1 Xcan2 Xcan3 Xcan4
## poid 0.94660 -0.6088535 1.485529 0.770784
## disq -0.68111 1.4055079 -0.677713 -0.021927
## jave -0.28437 0.0074946 0.076189 -1.216793
## perc 0.73503 0.0662211 -0.836913 -0.253155
##
## Y variables:
## Ycan1 Ycan2 Ycan3 Ycan4
## 100 -0.251053 0.080169 -0.501680 -0.10119
## long 0.089086 0.289806 0.047488 -1.06577
## haut -0.092248 0.418897 -0.317659 -0.25448
## 400 0.254005 -0.469382 1.461976 -0.39689
## 110 -0.929616 0.278657 -0.341989 -0.40049
## 1500 -0.045226 1.202536 -0.294172 0.20546
par(mfrow = c(2,2))
plot(olycca,which = 1)
plot(olycca,which = 2)
plot(olycca,which = 3)
plot(olycca,which = 4)
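## Sanity check: the first canonical correlation should equal the correlation between
## the first pair of canonical scores (a sketch, assuming the scores component of the
## candisc "cancor" object, which holds the X and Y canonical variates):
cor(olycca$scores$X[,1], olycca$scores$Y[,1]) # should be close to 0.5866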
## Visualize the correlation matrices used in the canonical correlation analysis
olymat <- matcor(olytab_sX, olytab_sY)
img.matcor(olymat,type = 2)
library(MASS)
library(klaR)
data("iris")
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## Select 100 samples as the training set; the rest form the test set
set.seed(223)
index <- sample(nrow(iris),100)
iris_train <- iris[index,]
iris_test <- iris[-index,]
table(iris_train$Species)
##
## setosa versicolor virginica
## 33 32 35
table(iris_test$Species)
##
## setosa versicolor virginica
## 17 18 15
## Linear discriminant analysis
irislda <- lda(Species~.,data = iris_train)
irislda
## Call:
## lda(Species ~ ., data = iris_train)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.33 0.32 0.35
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.084848 3.524242 1.496970 0.2545455
## versicolor 5.903125 2.753125 4.196875 1.3125000
## virginica 6.602857 2.951429 5.528571 1.9800000
##
## Coefficients of linear discriminants:
## LD1 LD2
## Sepal.Length 1.217393 0.1726320
## Sepal.Width 1.220191 -2.8811341
## Petal.Length -2.379484 0.1012736
## Petal.Width -3.102694 -1.5076948
##
## Proportion of trace:
## LD1 LD2
## 0.9906 0.0094
## Predict on the test set
irisldapre <- predict(irislda,iris_test)
table(iris_test$Species,irisldapre$class)
##
## setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 16 2
## virginica 0 0 15
## Only two test samples are misclassified
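## The overall accuracy follows directly from the confusion table:
mean(irisldapre$class == iris_test$Species) # 48/50 = 0.96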
plot(irislda,abbrev = 1)
## Use partimat() from the klaR package to visualize the decision regions of the discriminant models
## Linear discriminant analysis
par(family = "STKaiti")
partimat(Species~.,data = iris_train,method="lda",
main = "Linear discriminant analysis")
## Quadratic discriminant analysis
par(family = "STKaiti")
partimat(Species~.,data = iris_train,method="qda",
main = "Quadratic discriminant analysis")
## Fit the quadratic discriminant model
irisqda <- qda(Species~.,data = iris_train)
irisqda
## Call:
## qda(Species ~ ., data = iris_train)
##
## Prior probabilities of groups:
## setosa versicolor virginica
## 0.33 0.32 0.35
##
## Group means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## setosa 5.084848 3.524242 1.496970 0.2545455
## versicolor 5.903125 2.753125 4.196875 1.3125000
## virginica 6.602857 2.951429 5.528571 1.9800000
## Predict on the test set
irisqdapre <- predict(irisqda,iris_test)
table(iris_test$Species,irisqdapre$class)
##
## setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 16 2
## virginica 0 0 15
## The accuracy matches the result of the linear discriminant analysis
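## Quick check of the quadratic model's accuracy:
mean(irisqdapre$class == iris_test$Species) # also 0.96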
library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(arules)
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
## Market basket analysis
## Read the data
groupdata <- read_csv("data/chap6/dataset_group.csv",col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_double(),
## X2 = col_character()
## )
summary(groupdata)
## X1 X2
## Min. : 1.0 Length:22343
## 1st Qu.: 292.0 Class :character
## Median : 582.0 Mode :character
## Mean : 576.4
## 3rd Qu.: 863.0
## Max. :1139.0
## The data cover 38 distinct products across 1139 purchase records
unique(groupdata$X2)
## [1] "yogurt" "pork"
## [3] "sandwich bags" "lunch meat"
## [5] "all- purpose" "flour"
## [7] "soda" "butter"
## [9] "vegetables" "beef"
## [11] "aluminum foil" "dinner rolls"
## [13] "shampoo" "mixes"
## [15] "soap" "laundry detergent"
## [17] "ice cream" "toilet paper"
## [19] "hand soap" "waffles"
## [21] "cheeses" "milk"
## [23] "dishwashing liquid/detergent" "individual meals"
## [25] "cereals" "tortillas"
## [27] "spaghetti sauce" "ketchup"
## [29] "sandwich loaves" "poultry"
## [31] "bagels" "eggs"
## [33] "juice" "pasta"
## [35] "paper towels" "coffee/tea"
## [37] "fruits" "sugar"
length(unique(groupdata$X1))
## [1] 1139
## Count how many times each product appears in the data
items_unm <- groupdata %>%
group_by(X2)%>%
summarise(num = n())
ggplot(items_unm,aes(x = reorder(X2,num),y = num)) +
theme_bw(base_family = "STKaiti",base_size = 10) +
geom_bar(stat = "identity",fill = "lightblue") +
labs(x = "商品",y = "商品出现次数") +
coord_flip() +
geom_text(aes(x = reorder(X2,num),y = num + 50,label = num),size = 3)
# Convert the table to a list of baskets
buy_data<- split(x=groupdata$X2,f=as.factor(groupdata$X1))
# Total number of item instances
sum(sapply(buy_data,length))
## [1] 22343
# Remove duplicate items within each basket
buy_data <- lapply(buy_data,unique)
sum(sapply(buy_data,length))
## [1] 16753
## 转化为("transactions")交易数据集
buy_data <- as(buy_data,"transactions")
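## A quick look at the resulting transactions object:
summary(buy_data)
inspect(buy_data[1:3])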
## 1: Visualize the frequent items
## Items appearing with support greater than 0.25
par(family = "STKaiti",cex = 0.7)
itemFrequencyPlot(buy_data,support = 0.25,col = "lightblue",
xlab = "Item",ylab = "Relative frequency",
main = "Items with support > 0.25")
## Visualize the top 20 items
par(family = "STKaiti",cex = 0.75)
itemFrequencyPlot(buy_data,topN = 20,col = "lightblue",
xlab = "Item",ylab = "Relative frequency",
main = "Top 20 items")
## Mine association rules with Apriori
myrule <- apriori(data = buy_data,
parameter = list(support = 0.25,
confidence = 0.4,
minlen = 1))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.25 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 284
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[38 item(s), 1139 transaction(s)] done [0.00s].
## sorting and recoding items ... [38 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [57 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## 57 rules were found
summary(myrule)
## set of 57 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2
## 2 55
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 1.965 2.000 2.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.2704 Min. :0.4002 Min. :1.000 Min. :308.0
## 1st Qu.:0.2915 1st Qu.:0.4216 1st Qu.:1.053 1st Qu.:332.0
## Median :0.3003 Median :0.7728 Median :1.065 Median :342.0
## Mean :0.3107 Mean :0.6631 Mean :1.066 Mean :353.9
## 3rd Qu.:0.3108 3rd Qu.:0.7875 3rd Qu.:1.078 3rd Qu.:354.0
## Max. :0.7392 Max. :0.8378 Max. :1.133 Max. :842.0
##
## mining info:
## data ntransactions support confidence
## buy_data 1139 0.25 0.4
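## Inspect the strongest of the 57 rules, sorted by lift:
inspect(sort(myrule, by = "lift")[1:5])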
## Association analysis 2: constrain which items may appear on the right-hand side of a rule
## Here we mine only rules whose rhs is "ice cream"
myrule2 <- apriori(buy_data, # data set
parameter = list(minlen =3, # minimum rule length
maxlen = 8,# maximum rule length
supp = 0.1, ## support threshold
conf = 0.45, ## confidence threshold
target = "rules"),
## the rhs may only contain "ice cream";
## the lhs keeps the default (any items)
appearance = list(rhs=c("ice cream"),
default="lhs"))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.45 0.1 1 none FALSE TRUE 5 0.1 3
## maxlen target ext
## 8 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 113
##
## set item appearances ...[1 item(s)] done [0.00s].
## set transactions ...[38 item(s), 1139 transaction(s)] done [0.00s].
## sorting and recoding items ... [38 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [10 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(myrule2)
## set of 10 rules
##
## rule length distribution (lhs + rhs):sizes
## 3
## 10
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3 3 3 3 3 3
##
## summary of quality measures:
## support confidence lift count
## Min. :0.1291 Min. :0.4523 Min. :1.135 Min. :147.0
## 1st Qu.:0.1328 1st Qu.:0.4556 1st Qu.:1.143 1st Qu.:151.2
## Median :0.1343 Median :0.4675 Median :1.173 Median :153.0
## Mean :0.1371 Mean :0.4644 Mean :1.165 Mean :156.2
## 3rd Qu.:0.1431 3rd Qu.:0.4700 3rd Qu.:1.179 3rd Qu.:163.0
## Max. :0.1466 Max. :0.4758 Max. :1.194 Max. :167.0
##
## mining info:
## data ntransactions support confidence
## buy_data 1139 0.1 0.45
## Explore the rules in more detail: sort them by lift
myrule2_sortl <- sort(myrule2,by = "lift")
inspect(myrule2_sortl)
## lhs rhs support confidence lift
## [1] {paper towels,vegetables} => {ice cream} 0.1378402 0.4757576 1.193586
## [2] {sandwich loaves,vegetables} => {ice cream} 0.1343284 0.4751553 1.192075
## [3] {lunch meat,vegetables} => {ice cream} 0.1466198 0.4704225 1.180201
## [4] {aluminum foil,vegetables} => {ice cream} 0.1457419 0.4689266 1.176448
## [5] {cheeses,vegetables} => {ice cream} 0.1448639 0.4687500 1.176005
## [6] {pasta,vegetables} => {ice cream} 0.1334504 0.4662577 1.169752
## [7] {fruits,vegetables} => {ice cream} 0.1325724 0.4561934 1.144503
## [8] {soap,vegetables} => {ice cream} 0.1343284 0.4553571 1.142405
## [9] {beef,vegetables} => {ice cream} 0.1325724 0.4548193 1.141055
## [10] {individual meals,vegetables} => {ice cream} 0.1290606 0.4523077 1.134754
## count
## [1] 157
## [2] 153
## [3] 167
## [4] 166
## [5] 165
## [6] 152
## [7] 151
## [8] 153
## [9] 151
## [10] 147
## Visualize the mined rules
plot(myrule2, method="graph")
plot(myrule2, method="grouped")