library(ggplot2)
theme_set(theme_bw(base_family = "STKaiti"))
https://www.kaggle.com/uciml/breast-cancer-wisconsin-data
library(e1071)
library(Metrics)
library(readr)
library(gmodels)
## 读取数据
Cancer <- read_csv("data/chap11/Breast Cancer Wisconsin.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## diagnosis = col_character()
## )
## See spec(...) for full column specifications.
Cancer$diagnosis <- as.factor(Cancer$diagnosis)
head(Cancer)
## # A tibble: 6 x 32
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## <dbl> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 8.42e5 M 18.0 10.4 123. 1001
## 2 8.43e5 M 20.6 17.8 133. 1326
## 3 8.43e7 M 19.7 21.2 130 1203
## 4 8.43e7 M 11.4 20.4 77.6 386.
## 5 8.44e7 M 20.3 14.3 135. 1297
## 6 8.44e5 M 12.4 15.7 82.6 477.
## # … with 26 more variables: smoothness_mean <dbl>, compactness_mean <dbl>,
## # concavity_mean <dbl>, `concave points_mean` <dbl>, symmetry_mean <dbl>,
## # fractal_dimension_mean <dbl>, radius_se <dbl>, texture_se <dbl>,
## # perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## # compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## # symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## # texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## # smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## # `concave points_worst` <dbl>, symmetry_worst <dbl>,
## # fractal_dimension_worst <dbl>
summary(Cancer)
## id diagnosis radius_mean texture_mean
## Min. : 8670 B:357 Min. : 6.981 Min. : 9.71
## 1st Qu.: 869218 M:212 1st Qu.:11.700 1st Qu.:16.17
## Median : 906024 Median :13.370 Median :18.84
## Mean : 30371831 Mean :14.127 Mean :19.29
## 3rd Qu.: 8813129 3rd Qu.:15.780 3rd Qu.:21.80
## Max. :911320502 Max. :28.110 Max. :39.28
## perimeter_mean area_mean smoothness_mean compactness_mean
## Min. : 43.79 Min. : 143.5 Min. :0.05263 Min. :0.01938
## 1st Qu.: 75.17 1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492
## Median : 86.24 Median : 551.1 Median :0.09587 Median :0.09263
## Mean : 91.97 Mean : 654.9 Mean :0.09636 Mean :0.10434
## 3rd Qu.:104.10 3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040
## Max. :188.50 Max. :2501.0 Max. :0.16340 Max. :0.34540
## concavity_mean concave points_mean symmetry_mean fractal_dimension_mean
## Min. :0.00000 Min. :0.00000 Min. :0.1060 Min. :0.04996
## 1st Qu.:0.02956 1st Qu.:0.02031 1st Qu.:0.1619 1st Qu.:0.05770
## Median :0.06154 Median :0.03350 Median :0.1792 Median :0.06154
## Mean :0.08880 Mean :0.04892 Mean :0.1812 Mean :0.06280
## 3rd Qu.:0.13070 3rd Qu.:0.07400 3rd Qu.:0.1957 3rd Qu.:0.06612
## Max. :0.42680 Max. :0.20120 Max. :0.3040 Max. :0.09744
## radius_se texture_se perimeter_se area_se
## Min. :0.1115 Min. :0.3602 Min. : 0.757 Min. : 6.802
## 1st Qu.:0.2324 1st Qu.:0.8339 1st Qu.: 1.606 1st Qu.: 17.850
## Median :0.3242 Median :1.1080 Median : 2.287 Median : 24.530
## Mean :0.4052 Mean :1.2169 Mean : 2.866 Mean : 40.337
## 3rd Qu.:0.4789 3rd Qu.:1.4740 3rd Qu.: 3.357 3rd Qu.: 45.190
## Max. :2.8730 Max. :4.8850 Max. :21.980 Max. :542.200
## smoothness_se compactness_se concavity_se concave points_se
## Min. :0.001713 Min. :0.002252 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.005169 1st Qu.:0.013080 1st Qu.:0.01509 1st Qu.:0.007638
## Median :0.006380 Median :0.020450 Median :0.02589 Median :0.010930
## Mean :0.007041 Mean :0.025478 Mean :0.03189 Mean :0.011796
## 3rd Qu.:0.008146 3rd Qu.:0.032450 3rd Qu.:0.04205 3rd Qu.:0.014710
## Max. :0.031130 Max. :0.135400 Max. :0.39600 Max. :0.052790
## symmetry_se fractal_dimension_se radius_worst texture_worst
## Min. :0.007882 Min. :0.0008948 Min. : 7.93 Min. :12.02
## 1st Qu.:0.015160 1st Qu.:0.0022480 1st Qu.:13.01 1st Qu.:21.08
## Median :0.018730 Median :0.0031870 Median :14.97 Median :25.41
## Mean :0.020542 Mean :0.0037949 Mean :16.27 Mean :25.68
## 3rd Qu.:0.023480 3rd Qu.:0.0045580 3rd Qu.:18.79 3rd Qu.:29.72
## Max. :0.078950 Max. :0.0298400 Max. :36.04 Max. :49.54
## perimeter_worst area_worst smoothness_worst compactness_worst
## Min. : 50.41 Min. : 185.2 Min. :0.07117 Min. :0.02729
## 1st Qu.: 84.11 1st Qu.: 515.3 1st Qu.:0.11660 1st Qu.:0.14720
## Median : 97.66 Median : 686.5 Median :0.13130 Median :0.21190
## Mean :107.26 Mean : 880.6 Mean :0.13237 Mean :0.25427
## 3rd Qu.:125.40 3rd Qu.:1084.0 3rd Qu.:0.14600 3rd Qu.:0.33910
## Max. :251.20 Max. :4254.0 Max. :0.22260 Max. :1.05800
## concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
## Min. :0.0000 Min. :0.00000 Min. :0.1565 Min. :0.05504
## 1st Qu.:0.1145 1st Qu.:0.06493 1st Qu.:0.2504 1st Qu.:0.07146
## Median :0.2267 Median :0.09993 Median :0.2822 Median :0.08004
## Mean :0.2722 Mean :0.11461 Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.3829 3rd Qu.:0.16140 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :1.2520 Max. :0.29100 Max. :0.6638 Max. :0.20750
table(Cancer$diagnosis)
##
## B M
## 357 212
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggplot2)
library(pheatmap)
## 可视化碎石图,选择合适的主成分数
parpca <- fa.parallel(Cancer[,3:32],fa = "pc")
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor extraction method.
## Parallel analysis suggests that the number of factors = NA and the number of components = 5
## 主成分分析,提取前2个主成分
can_cor <- cor(Cancer[,3:32],Cancer[,3:32])
dim(can_cor)
## [1] 30 30
canpca <- principal(can_cor,nfactors = 5,rotate = "cluster")
## 可视化主成分和每个变量的相关性
pheatmap(canpca$loadings,cluster_rows = F,cluster_cols = F)
## 使用pca模型获取数据集的5个主成分
can_score<- predict.psych(canpca,Cancer[,3:32])
dim(can_score)
## [1] 569 5
head(can_score)
## RC1 RC5 RC2 RC3 RC4
## [1,] 2.14398826 2.6686188 1.2409385 0.1051650 -1.424708623
## [2,] 1.26818755 -0.4886897 -0.3731027 -1.0945307 -0.047042941
## [3,] 1.63678624 0.9139543 0.5164043 -0.4680318 0.164123014
## [4,] -0.07931035 5.1876659 2.0412375 0.9794555 -0.009439497
## [5,] 1.53865585 -0.1956127 0.4809143 0.1801097 -1.403098202
## [6,] -0.15095910 2.3711514 0.5010229 -0.4343232 -0.401721316
## 可视化前两个维度的数据分布
plotdata <- data.frame(PC1 = can_score[,1],PC2 = can_score[,2],
diagnosis = Cancer$diagnosis)
ggplot(plotdata,aes(x=PC1,y= PC2,colour = diagnosis,shape = diagnosis))+
geom_point()+theme(legend.position = "top")
## 数据70%用于训练,30%用于测试模型效果
## 为了方便可视化,只使用前两个主成分
set.seed(123)
index <- sample(nrow(can_score),round(0.7*nrow(can_score)))
cancerdata <- data.frame(can_score[,1:2])
cancerdata$diagnosis <- as.factor(Cancer$diagnosis)
colnames(cancerdata) <- c("PC1","PC2","y")
train_data <- cancerdata[index,]
test_data <- cancerdata[-index,]
## 线性核的支持向量机分类模型
can_svm <- svm(y~.,data = train_data,kernel ="linear")
summary(can_svm)
##
## Call:
## svm(formula = y ~ ., data = train_data, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 63
##
## ( 31 32 )
##
##
## Number of Classes: 2
##
## Levels:
## B M
## 在训练数据集上,可视化支持向量和分类面
plot(can_svm,train_data)
## 查看模型在测试集上的预测效果
test_pre <- as.character(predict(can_svm,test_data))
sprintf("base SVM model acc: %f",accuracy(test_data$y,test_pre))
## [1] "base SVM model acc: 0.959064"
table(test_data$y,test_pre)
## test_pre
## B M
## B 97 1
## M 6 67
CrossTable(test_data$y,test_pre,prop.r=F, prop.c=F,prop.t=F, prop.chisq=F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## |-------------------------|
##
##
## Total Observations in Table: 171
##
##
## | test_pre
## test_data$y | B | M | Row Total |
## -------------|-----------|-----------|-----------|
## B | 97 | 1 | 98 |
## -------------|-----------|-----------|-----------|
## M | 6 | 67 | 73 |
## -------------|-----------|-----------|-----------|
## Column Total | 103 | 68 | 171 |
## -------------|-----------|-----------|-----------|
##
##
## 模型的精度能够提高吗?
## 调整参数提升模型的精度
set.seed(1)
svm_opt <- tune.svm(y~.,data = train_data,kernel ="linear",cost = seq(1,20,1))
## 可视化参数搜索的结果
plot(svm_opt,main = "SVM best cost parameter")
svm_opt$best.parameters
## cost
## 14 14
svm_opt$best.performance
## [1] 0.05794872
svmbest <- svm_opt$best.model
summary(svmbest)
##
## Call:
## best.svm(x = y ~ ., data = train_data, cost = seq(1, 20, 1), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 14
##
## Number of Support Vectors: 54
##
## ( 27 27 )
##
##
## Number of Classes: 2
##
## Levels:
## B M
## 查看模型在测试集上的预测效果
test_pre <- as.character(predict(svmbest,test_data))
sprintf("best SVM model acc: %f",accuracy(test_data$y,test_pre))
## [1] "best SVM model acc: 0.964912"
table(test_data$y,test_pre)
## test_pre
## B M
## B 97 1
## M 5 68
CrossTable(test_data$y,test_pre,prop.r=F, prop.c=F,prop.t=F, prop.chisq=F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## |-------------------------|
##
##
## Total Observations in Table: 171
##
##
## | test_pre
## test_data$y | B | M | Row Total |
## -------------|-----------|-----------|-----------|
## B | 97 | 1 | 98 |
## -------------|-----------|-----------|-----------|
## M | 5 | 68 | 73 |
## -------------|-----------|-----------|-----------|
## Column Total | 102 | 69 | 171 |
## -------------|-----------|-----------|-----------|
##
##
## 针对线性核,虽然不同的cost参数下支持向量的数量不一样,但是在测试集上的预测精度是一样的
## 进一步的提高模型精度,搜索更多的参数
set.seed(1)
Optvm <- tune.svm(y~.,data = train_data,kernel ="radial",
cost = seq(1,20,1),gamma = seq(0.1,1,0.1))
plot(Optvm)
Optvm$best.parameters
## gamma cost
## 15 0.5 2
Optvm$best.performance
## [1] 0.05794872
svmbest2 <- Optvm$best.model
summary(svmbest2)
##
## Call:
## best.svm(x = y ~ ., data = train_data, gamma = seq(0.1, 1, 0.1),
## cost = seq(1, 20, 1), kernel = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 2
##
## Number of Support Vectors: 71
##
## ( 35 36 )
##
##
## Number of Classes: 2
##
## Levels:
## B M
## 可视化新的支持向量机分类面
plot(svmbest2,data = train_data)
## 查看模型在测试集上的预测效果
test_pre <- as.character(predict(svmbest2,test_data))
sprintf("best SVM model acc: %f",accuracy(test_data$y,test_pre))
## [1] "best SVM model acc: 0.947368"
table(test_data$y,test_pre)
## test_pre
## B M
## B 97 1
## M 8 65
CrossTable(test_data$y,test_pre,prop.r=F, prop.c=F,prop.t=F, prop.chisq=F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## |-------------------------|
##
##
## Total Observations in Table: 171
##
##
## | test_pre
## test_data$y | B | M | Row Total |
## -------------|-----------|-----------|-----------|
## B | 97 | 1 | 98 |
## -------------|-----------|-----------|-----------|
## M | 8 | 65 | 73 |
## -------------|-----------|-----------|-----------|
## Column Total | 105 | 66 | 171 |
## -------------|-----------|-----------|-----------|
##
##
## 使用新的核函数得到了新的数据切分曲面,模型的分类效果比使用线性核效果好
识别二维数据中的异常值
异常值识别垃圾邮件
## 二维数据中的异常值
library(ggplot2)
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(e1071)
library(Rlof)
## Loading required package: doParallel
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
# ## 生成随机数据
# set.seed(1234)
# samp1 <- mvrnorm(n = 200,mu=c(5,5),
# Sigma = matrix(c(1,0,0,1),nrow = 2,ncol = 2))
# samp1 <- samp1[samp1[,1] < 7,]
# samp2 <- mvrnorm(n = 3,mu=c(0,0),
# Sigma = matrix(c(5,0,0,5),nrow = 2,ncol = 2))
# samp3 <- mvrnorm(n = 2,mu=c(6,12),
# Sigma = matrix(c(1,0,0,2),nrow = 2,ncol = 2))
# samp4 <- mvrnorm(n = 2,mu=c(12,3),
# Sigma = matrix(c(3,0,0,4),nrow = 2,ncol = 2))
# samp5 <- mvrnorm(n = 1,mu=c(12,6),
# Sigma = matrix(c(1,0,0,1.5),nrow = 2,ncol = 2))
# samp6 <- mvrnorm(n = 2,mu=c(11,10),
# Sigma = matrix(c(2,0,0,2),nrow = 2,ncol = 2))
# samp7 <- mvrnorm(n = 3,mu=c(-1,10),
# Sigma = matrix(c(6,0,0,6),nrow = 2,ncol = 2))
# samp <- rbind(samp1,samp2,samp3,samp4,samp5,samp6,samp7)
# par(pty = c("s"))
# plot(samp[,1],samp[,2])
# ## 随机分布样本
# scatter2dim <- data.frame(x = samp[,1],y = samp[,2])
# index <- sample(nrow(scatter2dim),nrow(scatter2dim))
# scatter2dim <- scatter2dim[index,]
# plot(scatter2dim$x,scatter2dim$y)
# write.csv(scatter2dim,"data/chap10/outlierdata.csv",row.names = F)
## 读取异常值数据
outlierdata <- read.csv("data/chap11/outlierdata.csv")
## 可视化数据的分布
qplot(outlierdata$x,outlierdata$y,xlab = "X",ylab = "Y",main = "散点图")
## Lof方法识别异常值
lof_score <- lofactor(outlierdata,k=6)
outlierdata$lof_score <- lof_score
outlierdata$scorecut <- cut_width(lof_score,2)
## 可视化LOF得分
ggplot(outlierdata,aes(x=x,y=y,colour=lof_score))+
geom_point(aes(shape = scorecut))+
ggtitle("LOF得分")
## 可以指定距离,多核并行计算lof得分
lof_score2 <- Rlof::lof(outlierdata[,c("x","y")],k=6,cores = 2,
method = "maximum")
outlierdata$lof_score2 <- lof_score2
outlierdata$scorecut2 <- cut_width(lof_score2,2)
## 可视化LOF得分
ggplot(outlierdata,aes(x=x,y=y,colour=lof_score2))+
geom_point(aes(shape = scorecut2))+
ggtitle("LOF得分")
## 使用支持向量机来识别异常值SVM
out_svm <- svm(x = outlierdata[,c("x","y")],y = NULL,
type = "one-classification",nu = 0.08)
out_svm
##
## Call:
## svm.default(x = outlierdata[, c("x", "y")], y = NULL, type = "one-classification",
## nu = 0.08)
##
##
## Parameters:
## SVM-Type: one-classification
## SVM-Kernel: radial
## gamma: 0.5
## nu: 0.08
##
## Number of Support Vectors: 22
## 预测是否为异常值
svmpre <- predict(out_svm,outlierdata[,c("x","y")])
outlierdata$isoutlier <- !svmpre
## 可视化SVM的识别结果
ggplot(outlierdata,aes(x=x,y=y))+
geom_point(aes(shape = isoutlier))+
ggtitle("SVM识别异常值")
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(gmodels)
load("data/chap8/spam_cp.RData")
load("data/chap8/spam.RData")
## 构建TF-IDF矩阵
controls = list(weighting = function(x) weightTfIdf(x,normalize = FALSE))
spam_dtm <- DocumentTermMatrix(spam_cp,control = controls)
## 通常情况下,文档-词项的tf-idf矩阵非常的稀疏,可以删除一些不重要的词来缓解矩阵的稀疏性,同时提高计算效率
spam_dtm <- removeSparseTerms(spam_dtm,0.999)
spam_dtm
## <<DocumentTermMatrix (documents: 5572, terms: 1252)>>
## Non-/sparse entries: 36634/6939510
## Sparsity : 99%
## Maximal term length: 19
## Weighting : term frequency - inverse document frequency (tf-idf)
## 数据随机切分为75%训练集和25%测试集
set.seed(123)
index <- sample(nrow(spam),nrow(spam)*0.75)
spam_dtm2mat <- as.matrix(spam_dtm)
train_x <- spam_dtm2mat[index,]
train_y <- spam$label[index]
test_x <- spam_dtm2mat[-index,]
test_y <- spam$label[-index]
# train_x <- apply(train_x, 2, function(x) ifelse(x>0,1,0))
# test_x <- apply(test_x, 2, function(x) ifelse(x>0,1,0))
## 使用支持向量机来识别异常值SVM
out_svm <- svm(x = train_x,y = NULL,
type = "one-classification",nu = 0.1)
## 预测是否为异常值
svmpre <- predict(out_svm,test_x)
table(svmpre)
## svmpre
## FALSE TRUE
## 282 1111
table(test_y)
## test_y
## ham spam
## 1206 187
svmpre2 <- ifelse(svmpre == TRUE, "ham","spam")
CrossTable(test_y,svmpre2,prop.r = F,prop.t = F,prop.chisq = F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1393
##
##
## | svmpre2
## test_y | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 1005 | 201 | 1206 |
## | 0.905 | 0.713 | |
## -------------|-----------|-----------|-----------|
## spam | 106 | 81 | 187 |
## | 0.095 | 0.287 | |
## -------------|-----------|-----------|-----------|
## Column Total | 1111 | 282 | 1393 |
## | 0.798 | 0.202 | |
## -------------|-----------|-----------|-----------|
##
##
table(test_y,svmpre2)
## svmpre2
## test_y ham spam
## ham 1005 201
## spam 106 81
sprintf("异常值检测识别精度为%.4f",accuracy(test_y,svmpre2))
## [1] "异常值检测识别精度为0.7796"
library(e1071)
library(caret)
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(Metrics)
library(readr)
library(gmodels)
library(outliers)
##
## Attaching package: 'outliers'
## The following object is masked from 'package:psych':
##
## outlier
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(corrplot)
## corrplot 0.84 loaded
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
kc_house <- read_csv("data/chap11/kc_house_data.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## id = col_character(),
## date = col_datetime(format = "")
## )
## See spec(...) for full column specifications.
kc_house$date <- year(kc_house$date)
kc_house <- kc_house[,2:21]
## 剔除价格异常的数据
outlierval <- boxplot.stats(kc_house$price)$out
kc_house <- kc_house[!(kc_house$price %in% outlierval),]
dim(kc_house)
## [1] 20467 20
## 剔除 bedrooms 异常的数据(==0,>6)
index <- c(which(kc_house$bedrooms == 0),which(kc_house$bedrooms > 6))
kc_house <- kc_house[-index,]
dim(kc_house)
## [1] 20408 20
## 剔除 bathrooms 取值为异常值的数据
outlierval <- boxplot.stats(kc_house$bathrooms)$out
kc_house <- kc_house[!(kc_house$bathrooms %in% outlierval),]
dim(kc_house)
## [1] 20316 20
## 剔除 sqft_lot 取值为异常值的数据
outlierval <- boxplot.stats(kc_house$sqft_lot)$out
kc_house <- kc_house[!(kc_house$sqft_lot %in% outlierval),]
dim(kc_house)
## [1] 18112 20
## 剔除特征 waterfront and view
kc_house$waterfront <- NULL
kc_house$view <- NULL
## 剔除 condition 取值为异常值的数据
outlierval <- boxplot.stats(kc_house$condition)$out
kc_house <- kc_house[!(kc_house$condition %in% outlierval),]
dim(kc_house)
## [1] 18091 18
## 剔除 sqft_basement 取值为异常值的数据
outlierval <- boxplot.stats(kc_house$sqft_basement)$out
kc_house <- kc_house[!(kc_house$sqft_basement %in% outlierval),]
dim(kc_house)
## [1] 17687 18
## 房子建立的时间长短,并剔除取值小于0的数据
kc_house$yr_built <- kc_house$date - kc_house$yr_built
index <- which(kc_house$yr_built < 0)
kc_house <- kc_house[-index,]
dim(kc_house)
## [1] 17675 18
## 房子是否装修过,装修过为1,没装修过为0
index <- which(kc_house$yr_renovated != 0)
kc_house$yr_renovated[index] <- 1
dim(kc_house)
## [1] 17675 18
kc_house$zipcode <- as.factor(kc_house$zipcode)
## 剔除不需要的特征
kc_house <- kc_house[,2:14]
## 可视化房子的售价分布
ggplot(kc_house)+geom_histogram(aes(price),fill = "lightblue",bins = 60)
## 可视化特征之间的相关系数
kc_cor <- cor(kc_house[1:11],kc_house[1:11])
corrplot.mixed(kc_cor,tl.col="black",tl.pos = "lt",
tl.cex = 0.8,number.cex = 0.7)
head(kc_house)
## # A tibble: 6 x 13
## price bedrooms bathrooms sqft_living sqft_lot floors condition grade
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 221900 3 1 1180 5650 1 3 7
## 2 538000 3 2.25 2570 7242 2 3 7
## 3 180000 2 1 770 10000 1 3 6
## 4 604000 4 3 1960 5000 1 5 7
## 5 510000 3 2 1680 8080 1 3 8
## 6 257500 3 2.25 1715 6819 2 3 7
## # … with 5 more variables: sqft_above <dbl>, sqft_basement <dbl>,
## # yr_built <dbl>, yr_renovated <dbl>, zipcode <fct>
summary(kc_house)
## price bedrooms bathrooms sqft_living
## Min. : 80000 Min. :1.000 Min. :0.500 Min. : 370
## 1st Qu.: 305000 1st Qu.:3.000 1st Qu.:1.500 1st Qu.:1360
## Median : 422500 Median :3.000 Median :2.000 Median :1780
## Mean : 461056 Mean :3.276 Mean :2.004 Mean :1871
## 3rd Qu.: 579000 3rd Qu.:4.000 3rd Qu.:2.500 3rd Qu.:2290
## Max. :1127000 Max. :6.000 Max. :4.000 Max. :5050
##
## sqft_lot floors condition grade
## Min. : 520 Min. :1.000 Min. :2.000 Min. : 3.000
## 1st Qu.: 4800 1st Qu.:1.000 1st Qu.:3.000 1st Qu.: 7.000
## Median : 7110 Median :1.000 Median :3.000 Median : 7.000
## Mean : 7139 Mean :1.482 Mean :3.405 Mean : 7.451
## 3rd Qu.: 9122 3rd Qu.:2.000 3rd Qu.:4.000 3rd Qu.: 8.000
## Max. :18224 Max. :3.500 Max. :5.000 Max. :12.000
##
## sqft_above sqft_basement yr_built yr_renovated
## Min. : 370 Min. : 0.0 Min. : 0.00 Min. :0.00000
## 1st Qu.:1140 1st Qu.: 0.0 1st Qu.: 16.00 1st Qu.:0.00000
## Median :1460 Median : 0.0 Median : 42.00 Median :0.00000
## Mean :1637 Mean : 233.5 Mean : 44.09 Mean :0.03525
## 3rd Qu.:1990 3rd Qu.: 470.0 3rd Qu.: 64.00 3rd Qu.:0.00000
## Max. :5050 Max. :1250.0 Max. :115.00 Max. :1.00000
##
## zipcode
## 98103 : 579
## 98115 : 544
## 98117 : 530
## 98052 : 498
## 98034 : 496
## 98038 : 494
## (Other):14534
dim(kc_house)
## [1] 17675 13
anyNA(kc_house)
## [1] FALSE
write.csv(kc_house,"data/chap11/kc_house_clean.csv",row.names = FALSE)
## 切分数据
set.seed(123)
index <- sample(nrow(kc_house),round(0.7*nrow(kc_house)))
train_data <- kc_house[index,]
test_data <- kc_house[-index,]
dim(train_data)
## [1] 12372 13
dim(test_data)
## [1] 5303 13
## 支持向量机回归模型
system.time(
svmreg <- svm(price~.,data = train_data,kernel = "radial")
)
## user system elapsed
## 11.085 0.081 11.174
summary(svmreg)
##
## Call:
## svm(formula = price ~ ., data = train_data, kernel = "radial")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.01234568
## epsilon: 0.1
##
##
## Number of Support Vectors: 8917
## 在训练集上的误差
train_pre <- predict(svmreg,train_data)
train_mae <- mae(train_data$price,train_pre)
sprintf("训练集上的绝对值误差: %f",train_mae)
## [1] "训练集上的绝对值误差: 56347.919309"
test_pre <- predict(svmreg,test_data)
test_mae <- mae(test_data$price,test_pre)
sprintf("测试集上的绝对值误差: %f",test_mae)
## [1] "测试集上的绝对值误差: 58359.507538"
## 该部分程序运行时间较长,此处先将其注释掉
#############################################
# set.seed(234)
# tunecontrols <- tune.control(sampling = "bootstrap",boot.size = 0.8)
# system.time(
# svmregopt <- tune(svm,price~.,data = train_data,kernel = "radial",
# ranges = list(gamma = seq(0.01,0.1,0.02),
# cost= seq(11,20,2)),
# tunecontrol = tunecontrols)
# )
## 找到的模型参数
# plot(svmregopt,main = "Performance of svm regression")
# svmregopt$best.parameters
# svmregopt$best.performance
#############################################
## 得到的最优参数为cost=19,gamma=0.01
## 使用找到的最好参数进行SVM回归
system.time(
svmregbest <- svm(price~.,data = train_data,kernel = "radial",
cost=19,gamma=0.01)
)
## user system elapsed
## 17.398 0.102 17.512
## 在训练集上的误差
train_pre <- predict(svmregbest,train_data)
train_mae <- mae(train_data$price,train_pre)
sprintf("训练集上的绝对值误差: %f",train_mae)
## [1] "训练集上的绝对值误差: 48616.775325"
test_pre <- predict(svmregbest,test_data)
test_mae <- mae(test_data$price,test_pre)
sprintf("测试集上的绝对值误差: %f",test_mae)
## [1] "测试集上的绝对值误差: 52979.473453"
## 模型和原始的支持向量回归相比,效果提升了5000多
library(neuralnet)
library(NeuralNetTools)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:neuralnet':
##
## compute
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(psych)
library(Metrics)
library(ggplot2)
## 读取数据
Cancer <- read_csv("data/chap11/Breast Cancer Wisconsin.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## diagnosis = col_character()
## )
## See spec(...) for full column specifications.
Cancer$diagnosis <- as.integer(as.factor(Cancer$diagnosis))-1
## 主成分分析,提取前5个主成分
can_cor <- cor(Cancer[,3:32],Cancer[,3:32])
canpca <- principal(can_cor,nfactors = 5,rotate = "cluster")
can_score<- predict.psych(canpca,Cancer[,3:32])
## 数据70%用于训练,30%用于测试模型效果
set.seed(123)
index <- sample(nrow(can_score),round(0.7*nrow(can_score)))
train_data <- as.data.frame(can_score[index,])
train_data$y <- Cancer$diagnosis[index]
test_data <- as.data.frame(can_score[-index,])
test_data$y <- Cancer$diagnosis[-index]
head(train_data)
## RC1 RC5 RC2 RC3 RC4 y
## 1 0.1011215 -0.8996009 -0.86740003 0.7000380 1.68584164 1
## 2 -0.3603427 -1.3415099 -0.53912539 -0.6659501 1.29527571 0
## 3 -0.7493287 -2.0504789 -1.11433658 -0.6384352 0.76219966 0
## 4 -1.2799680 0.6470246 -0.34020304 -0.1953687 -1.26199913 0
## 5 0.2618498 0.6918794 1.22911675 -0.2793941 0.51394461 1
## 6 0.4541706 1.8289565 0.05629867 -0.6531238 0.03885923 1
colnames(train_data)
## [1] "RC1" "RC5" "RC2" "RC3" "RC4" "y"
##
## 单隐藏层 MLP model
sigmlp <- neuralnet(y~RC1+RC5+RC2+RC3+RC4,data = train_data,
hidden = c(10),linear.output = FALSE,
act.fct = "logistic",algorithm = "rprop+")
par(cex = 0.6)
plotnet(sigmlp,pos_col = "red", neg_col = "grey")
summary(sigmlp)
## Length Class Mode
## call 7 -none- call
## response 398 -none- numeric
## covariate 1990 -none- numeric
## model.list 2 -none- list
## err.fct 1 -none- function
## act.fct 1 -none- function
## linear.output 1 -none- logical
## data 6 data.frame list
## exclude 0 -none- NULL
## net.result 1 -none- list
## weights 1 -none- list
## generalized.weights 1 -none- list
## startweights 1 -none- list
## result.matrix 74 -none- numeric
## 测试集上的精度
test_pre <- neuralnet::compute(sigmlp,test_data[,1:5])
test_pre <- as.vector(ifelse(test_pre$net.result >0.5,1,0))
accuracy(test_data$y,test_pre)
## [1] 0.9824561
## 训练单隐藏层神经网络,分析隐藏层的不同在测试集上的精度
set.seed(123)
size <- seq(2,20,1)
netacc <- size
for (ii in 1:length(size)) {
signet <- neuralnet(y~RC1+RC5+RC2+RC3+RC4,data = train_data,
hidden = c(size[ii]),linear.output = FALSE,
act.fct = "logistic",algorithm = "rprop+")
test_pre <- neuralnet::compute(signet,test_data[,1:5])
test_pre <- as.vector(ifelse(test_pre$net.result >0.5,1,0))
netacc[ii] <- accuracy(test_data$y,test_pre)
}
data.frame(size = size,netacc = netacc) %>%
ggplot(aes(x=size,y = netacc))+
geom_line()+geom_point(colour="red")
https://www.kaggle.com/zalando-research/fashionmnist
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
##
## day, hour, month, week, year
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
library(caret)
library(ggcorrplot)
library(RColorBrewer)
## 启动初始化一个h2o实例; 定义为2核同时计算;
h2o.init(nthreads=4,max_mem_size='8G')
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /var/folders/dv/qd8nc5bx6xx3pxq_xx_j47240000gn/T//RtmpKr0MNP/h2o_sunlanxin_started_from_r.out
## /var/folders/dv/qd8nc5bx6xx3pxq_xx_j47240000gn/T//RtmpKr0MNP/h2o_sunlanxin_started_from_r.err
##
##
## Starting H2O JVM and connecting: . Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 seconds 576 milliseconds
## H2O cluster timezone: Asia/Shanghai
## H2O data parsing timezone: UTC
## H2O cluster version: 3.26.0.2
## H2O cluster version age: 4 months and 28 days !!!
## H2O cluster name: H2O_started_from_R_sunlanxin_hzc570
## H2O cluster total nodes: 1
## H2O cluster total memory: 7.11 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.6.2 (2019-12-12)
## Warning in h2o.clusterInfo():
## Your H2O cluster version is too old (4 months and 28 days)!
## Please download and install the latest version from http://h2o.ai/download/
## 因为数据量很大,为了节省计算时间,这里只使用部分数据集,完成训练和测试的工作
fashion <- read_csv("data/chap11/fashion-mnist_train.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
fashion_test <- read_csv("data/chap11/fashion-mnist_test.csv")
## Parsed with column specification:
## cols(
## .default = col_double()
## )
## See spec(...) for full column specifications.
fashion$label <- as.factor(fashion$label)
fashion[2:785] <- fashion[2:785] / 255.0
fashion_test[2:785] <- fashion_test[2:785] / 255.0
dim(fashion)
## [1] 60000 785
dim(fashion_test)
## [1] 10000 785
table(fashion$label)
##
## 0 1 2 3 4 5 6 7 8 9
## 6000 6000 6000 6000 6000 6000 6000 6000 6000 6000
table(fashion_test$label)
##
## 0 1 2 3 4 5 6 7 8 9
## 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000
## 可视化部分数据
## 可视化部分样本的图像
set.seed(123)
index <- sample(nrow(fashion),50)
par(mfrow = c(5,10),mai=c(0.05,0.05,0.05,0.05))
for(ii in seq_along(index)){
im <- as.numeric(fashion[index[ii],2:785])
im <- matrix(im,nrow=28,ncol = 28,byrow = F)
im <- apply(apply(im, 1, rev),1,rev)
image(im,col = gray(seq(0, 1, length = 256)),xaxt= "n", yaxt= "n")
}
## 数据切分为训练集和验证集,80%训练集,20%验证集
fashion_h2o <- as.h2o(fashion)
##
|
| | 0%
|
|======================================================================| 100%
splits <- h2o.splitFrame(data = fashion_h2o, ratios = 0.8,seed = 1234)
train_data <- splits[[1]]
val_data <- splits[[2]]
dim(train_data)
## [1] 48009 785
dim(val_data)
## [1] 11991 785
## MLP分类模型
mlpclf <- h2o.deeplearning(x = 2:785, y = 1, train_data, activation = "Tanh",
hidden=c(100,100,100,100),epochs = 5,
loss = "CrossEntropy",score_each_iteration = TRUE,
mini_batch_size = 500,validation_frame = val_data,
stopping_rounds=0,seed = 123)
##
|
| | 0%
|
|==== | 5%
|
|======= | 10%
|
|=========== | 15%
|
|============== | 20%
|
|================== | 26%
|
|===================== | 31%
|
|========================= | 36%
|
|============================= | 41%
|
|================================ | 46%
|
|==================================== | 51%
|
|======================================= | 56%
|
|=========================================== | 61%
|
|============================================== | 66%
|
|================================================== | 72%
|
|====================================================== | 77%
|
|========================================================= | 82%
|
|============================================================= | 87%
|
|================================================================ | 92%
|
|==================================================================== | 97%
|
|======================================================================| 100%
summary(mlpclf)
## Model Details:
## ==============
##
## H2OMultinomialModel: deeplearning
## Model Key: DeepLearning_model_R_1577262350391_1
## Status of Neuron Layers: predicting label, 10-class classification, multinomial distribution, CrossEntropy loss, 109,810 weights/biases, 1.4 MB, 245,594 training samples, mini-batch size 1
## layer units type dropout l1 l2 mean_rate rate_rms momentum
## 1 1 784 Input 0.00 % NA NA NA NA NA
## 2 2 100 Tanh 0.00 % 0.000000 0.000000 0.018767 0.049689 0.000000
## 3 3 100 Tanh 0.00 % 0.000000 0.000000 0.005727 0.001688 0.000000
## 4 4 100 Tanh 0.00 % 0.000000 0.000000 0.006707 0.002303 0.000000
## 5 5 100 Tanh 0.00 % 0.000000 0.000000 0.005533 0.004053 0.000000
## 6 6 10 Softmax NA 0.000000 0.000000 0.002659 0.001812 0.000000
## mean_weight weight_rms mean_bias bias_rms
## 1 NA NA NA NA
## 2 0.001021 0.080721 0.002030 0.077328
## 3 -0.001409 0.128151 -0.003879 0.129459
## 4 -0.000937 0.117577 -0.004774 0.177014
## 5 -0.001043 0.117010 0.021648 0.165734
## 6 -0.040423 0.479030 -0.265369 0.126030
##
## H2OMultinomialMetrics: deeplearning
## ** Reported on training data. **
## ** Metrics reported on temporary training frame with 9971 samples **
##
## Training Set Metrics:
## =====================
##
## MSE: (Extract with `h2o.mse`) 0.0761862
## RMSE: (Extract with `h2o.rmse`) 0.2760185
## Logloss: (Extract with `h2o.logloss`) 0.2599514
## Mean Per-Class Error: 0.09293306
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 0 1 2 3 4 5 6 7 8 9 Error Rate
## 0 913 1 4 24 5 1 50 0 0 0 0.0852 = 85 / 998
## 1 4 993 0 17 1 0 0 0 0 0 0.0217 = 22 / 1,015
## 2 10 0 749 7 119 1 67 1 3 0 0.2173 = 208 / 957
## 3 26 2 4 921 19 0 20 0 1 0 0.0725 = 72 / 993
## 4 3 1 53 30 929 0 21 0 1 0 0.1050 = 109 / 1,038
## 5 0 0 0 0 0 924 0 42 3 4 0.0504 = 49 / 973
## 6 150 0 37 20 80 1 729 0 5 0 0.2867 = 293 / 1,022
## 7 0 0 0 0 0 4 0 1040 0 11 0.0142 = 15 / 1,055
## 8 5 0 3 3 3 2 7 3 947 0 0.0267 = 26 / 973
## 9 1 0 0 0 0 4 0 42 0 900 0.0496 = 47 / 947
## Totals 1112 997 850 1022 1156 937 894 1128 960 915 0.0929 = 926 / 9,971
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-10 Hit Ratios:
## k hit_ratio
## 1 1 0.907131
## 2 2 0.978939
## 3 3 0.993782
## 4 4 0.997192
## 5 5 0.998997
## 6 6 0.999498
## 7 7 1.000000
## 8 8 1.000000
## 9 9 1.000000
## 10 10 1.000000
##
##
## H2OMultinomialMetrics: deeplearning
## ** Reported on validation data. **
## ** Metrics reported on full validation frame **
##
## Validation Set Metrics:
## =====================
##
## Extract validation frame with `h2o.getFrame("RTMP_sid_af88_5")`
## MSE: (Extract with `h2o.mse`) 0.09976002
## RMSE: (Extract with `h2o.rmse`) 0.3158481
## Logloss: (Extract with `h2o.logloss`) 0.3595032
## Mean Per-Class Error: 0.1201635
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,valid = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 0 1 2 3 4 5 6 7 8 9 Error
## 0 1020 0 14 27 3 1 86 0 3 0 0.1161
## 1 4 1165 4 27 2 0 7 0 1 0 0.0372
## 2 33 0 929 10 147 0 95 0 4 0 0.2373
## 3 39 5 7 1113 37 1 16 0 1 0 0.0870
## 4 7 1 71 39 1012 1 58 0 6 0 0.1531
## 5 1 0 0 0 1 1133 0 67 2 16 0.0713
## 6 196 2 77 28 135 2 758 0 11 0 0.3730
## 7 0 0 0 0 0 10 0 1146 0 15 0.0213
## 8 10 0 8 11 7 6 12 8 1152 0 0.0511
## 9 0 0 0 0 0 7 0 57 0 1117 0.0542
## Totals 1310 1173 1110 1255 1344 1161 1032 1278 1180 1148 0.1206
## Rate
## 0 = 134 / 1,154
## 1 = 45 / 1,210
## 2 = 289 / 1,218
## 3 = 106 / 1,219
## 4 = 183 / 1,195
## 5 = 87 / 1,220
## 6 = 451 / 1,209
## 7 = 25 / 1,171
## 8 = 62 / 1,214
## 9 = 64 / 1,181
## Totals = 1,446 / 11,991
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,valid = TRUE)`
## =======================================================================
## Top-10 Hit Ratios:
## k hit_ratio
## 1 1 0.879410
## 2 2 0.966391
## 3 3 0.987657
## 4 4 0.993245
## 5 5 0.996331
## 6 6 0.998332
## 7 7 0.999083
## 8 8 0.999500
## 9 9 0.999917
## 10 10 1.000000
##
##
##
##
## Scoring History:
## timestamp duration training_speed epochs iterations samples
## 1 2019-12-25 16:26:43 0.000 sec NA 0.00000 0 0.000000
## 2 2019-12-25 16:26:54 12.073 sec 1246 obs/sec 0.25824 1 12398.000000
## 3 2019-12-25 16:27:05 22.503 sec 1250 obs/sec 0.51290 2 24624.000000
## 4 2019-12-25 16:27:15 32.844 sec 1257 obs/sec 0.76786 3 36864.000000
## 5 2019-12-25 16:27:25 43.263 sec 1262 obs/sec 1.02654 4 49283.000000
## training_rmse training_logloss training_r2 training_classification_error
## 1 NA NA NA NA
## 2 0.38479 0.51683 0.98185 0.18283
## 3 0.37234 0.50024 0.98301 0.16809
## 4 0.34642 0.41711 0.98529 0.14703
## 5 0.34626 0.43092 0.98530 0.14783
## validation_rmse validation_logloss validation_r2
## 1 NA NA NA
## 2 0.38976 0.53994 0.98141
## 3 0.37900 0.52415 0.98242
## 4 0.35785 0.45120 0.98433
## 5 0.36240 0.47978 0.98393
## validation_classification_error
## 1 NA
## 2 0.18297
## 3 0.16963
## 4 0.15537
## 5 0.16004
##
## ---
## timestamp duration training_speed epochs iterations
## 17 2019-12-25 16:29:29 2 min 46.949 sec 1273 obs/sec 4.09321 16
## 18 2019-12-25 16:29:39 2 min 57.197 sec 1274 obs/sec 4.34971 17
## 19 2019-12-25 16:29:50 3 min 7.455 sec 1275 obs/sec 4.60509 18
## 20 2019-12-25 16:30:00 3 min 17.598 sec 1275 obs/sec 4.85842 19
## 21 2019-12-25 16:30:10 3 min 28.234 sec 1273 obs/sec 5.11558 20
## 22 2019-12-25 16:30:11 3 min 28.960 sec 1273 obs/sec 5.11558 20
## samples training_rmse training_logloss training_r2
## 17 196511.000000 0.29082 0.29054 0.98963
## 18 208825.000000 0.27920 0.26486 0.99045
## 19 221086.000000 0.27384 0.25659 0.99081
## 20 233248.000000 0.27602 0.25995 0.99066
## 21 245594.000000 0.28806 0.27811 0.98983
## 22 245594.000000 0.27602 0.25995 0.99066
## training_classification_error validation_rmse validation_logloss
## 17 0.10200 0.32603 0.38912
## 18 0.09618 0.31754 0.36290
## 19 0.09116 0.32042 0.37049
## 20 0.09287 0.31585 0.35950
## 21 0.10260 0.32884 0.38735
## 22 0.09287 0.31585 0.35950
## validation_r2 validation_classification_error
## 17 0.98699 0.12993
## 18 0.98766 0.12167
## 19 0.98743 0.12676
## 20 0.98779 0.12059
## 21 0.98677 0.13160
## 22 0.98779 0.12059
##
## Variable Importances: (Extract with `h2o.varimp`)
## =================================================
##
## Variable Importances:
## variable relative_importance scaled_importance percentage
## 1 pixel771 1.000000 1.000000 0.002565
## 2 pixel15 0.945291 0.945291 0.002425
## 3 pixel47 0.926730 0.926730 0.002377
## 4 pixel14 0.820528 0.820528 0.002105
## 5 pixel39 0.819039 0.819039 0.002101
##
## ---
## variable relative_importance scaled_importance percentage
## 779 pixel493 0.362333 0.362333 0.000929
## 780 pixel702 0.360451 0.360451 0.000924
## 781 pixel34 0.352331 0.352331 0.000904
## 782 pixel701 0.338989 0.338989 0.000869
## 783 pixel86 0.332298 0.332298 0.000852
## 784 pixel464 0.327093 0.327093 0.000839
h2o.mse(mlpclf)
## [1] 0.0761862
## 可视化模型误差的收敛过程
plot(mlpclf)
## 查看5张图像经过MLP模型后的特征的变化情况
index <- seq(200,nrow(fashion),6000)[1:5]
par(mfrow = c(5,5),mai=c(0.1,0.1,0.1,0.1))
for(ii in seq_along(index)){
imone <- fashion_h2o[index[ii],2:785]
imlay1 <- h2o.deepfeatures(mlpclf,imone,layer = 1)
imlay2 <- h2o.deepfeatures(mlpclf,imone,layer = 2)
imlay3 <- h2o.deepfeatures(mlpclf,imone,layer = 3)
imlay4 <- h2o.deepfeatures(mlpclf,imone,layer = 4)
imone <- matrix(imone,nrow = 28,ncol = 28)
imone <- apply(apply(imone, 1, rev),1,rev)
image(imone,axes = FALSE,main = "image")
image(matrix(imlay1,nrow = 10,ncol = 10),axes = FALSE,main = "layer1")
image(matrix(imlay2,nrow = 10,ncol = 10),axes = FALSE,main = "layer2")
image(matrix(imlay3,nrow = 10,ncol = 10),axes = FALSE,main = "layer3")
image(matrix(imlay4,nrow = 10,ncol = 10),axes = FALSE,main = "layer4")
}
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================== | 79%
|
|======================================================================| 100%
##
|
| | 0%
|
|======================================================================| 100%
##
|
| | 0%
|
|================================================ | 69%
|
|======================================================================| 100%
##
|
| | 0%
|
|===================================== | 53%
|
|======================================================================| 100%
##
|
| | 0%
|
|============================== | 42%
|
|======================================================================| 100%
##
|
| | 0%
|
|============================================= | 64%
|
|======================================================================| 100%
## 计算模型在测试集上的预测值和性能
test_data <- as.h2o(fashion_test)
##
|
| | 0%
|
|======================================================================| 100%
mlp_pre <- as.data.frame(h2o.predict(mlpclf, newdata = test_data))
##
|
| | 0%
|
|======================================================================| 100%
head(mlp_pre)
## predict p0 p1 p2 p3 p4
## 1 0 8.825573e-01 1.203678e-06 1.778921e-03 1.775553e-03 2.529055e-04
## 2 1 1.449403e-05 9.998722e-01 1.012056e-06 9.725374e-05 2.594125e-06
## 3 2 4.532433e-02 1.355734e-04 8.328083e-01 4.993226e-04 1.903358e-03
## 4 0 5.127520e-01 1.256277e-05 1.503413e-01 1.447441e-02 1.189241e-03
## 5 4 2.124521e-02 3.261013e-04 1.334515e-01 1.978805e-01 3.295533e-01
## 6 6 8.384536e-02 5.816908e-05 1.659194e-02 8.954610e-03 2.609597e-03
## p5 p6 p7 p8 p9
## 1 6.663631e-06 1.133134e-01 3.670872e-05 2.442098e-05 2.529408e-04
## 2 5.980163e-07 1.143874e-05 2.479186e-07 5.611547e-08 1.547157e-07
## 3 1.324675e-04 1.188200e-01 2.710366e-05 1.739044e-04 1.756738e-04
## 4 2.937390e-05 3.202861e-01 4.385079e-04 1.792104e-04 2.972488e-04
## 5 2.358185e-04 3.158153e-01 1.299353e-04 8.065782e-04 5.557758e-04
## 6 1.850168e-04 8.867538e-01 1.081465e-04 8.766160e-04 1.678942e-05
acc <- accuracy(fashion_test$label,mlp_pre$predict)
sprintf("MLP model acc: %f",acc)
## [1] "MLP model acc: 0.880400"
## 在测试集上的混淆矩阵热力图
confum <- caret::confusionMatrix(as.factor(fashion_test$label),mlp_pre$predict)
confumat <- as.data.frame(confum$table)
confumat[,1:2] <- apply(confumat[,1:2],2,as.integer)
ggplot(confumat,aes(x=Reference,y = Prediction))+
geom_tile(aes(fill = Freq))+
geom_text(aes(label = Freq))+
scale_x_continuous(breaks = c(0:9))+
scale_y_continuous(breaks = unique(confumat$Prediction),
trans = "reverse")+
scale_fill_gradient2(low="darkblue", high="lightgreen",
guide="colorbar")+
ggtitle("MLP分类器在测试集结果")
library(RSNNS)
## Loading required package: Rcpp
##
## Attaching package: 'RSNNS'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, train
kc_house <- read_csv("data/chap11/kc_house_clean.csv")
## Parsed with column specification:
## cols(
## price = col_double(),
## bedrooms = col_double(),
## bathrooms = col_double(),
## sqft_living = col_double(),
## sqft_lot = col_double(),
## floors = col_double(),
## condition = col_double(),
## grade = col_double(),
## sqft_above = col_double(),
## sqft_basement = col_double(),
## yr_built = col_double(),
## yr_renovated = col_double(),
## zipcode = col_double()
## )
## 将zipcode进行label编码
zipcode <- decodeClassLabels(kc_house$zipcode)
kc_house <- cbind(kc_house[1:12],zipcode)
## 数据max-min归一化到0-1之间
kc_house[,2:11] <- normalizeData(kc_house[,2:11],"0_1")
## 房价归一化到0-1之间,并获取归一化参数
price <- normalizeData(kc_house$price,type = "0_1")
NormParameters <- getNormParameters(price)
head(kc_house)
## price bedrooms bathrooms sqft_living sqft_lot floors condition grade
## 1 221900 0.4 0.1428571 0.17307692 0.2897650 0.0 0.3333333 0.4444444
## 2 538000 0.4 0.5000000 0.47008547 0.3796882 0.4 0.3333333 0.4444444
## 3 180000 0.2 0.1428571 0.08547009 0.5354722 0.0 0.3333333 0.3333333
## 4 604000 0.6 0.7142857 0.33974359 0.2530502 0.0 1.0000000 0.4444444
## 5 510000 0.4 0.4285714 0.27991453 0.4270221 0.0 0.3333333 0.5555556
## 6 257500 0.4 0.5000000 0.28739316 0.3557953 0.4 0.3333333 0.4444444
## sqft_above sqft_basement yr_built yr_renovated 98001 98002 98003 98004 98005
## 1 0.17307692 0.000 0.5130435 0 0 0 0 0 0
## 2 0.38461538 0.320 0.5478261 1 0 0 0 0 0
## 3 0.08547009 0.000 0.7130435 0 0 0 0 0 0
## 4 0.14529915 0.728 0.4260870 0 0 0 0 0 0
## 5 0.27991453 0.000 0.2434783 0 0 0 0 0 0
## 6 0.28739316 0.000 0.1652174 0 0 0 1 0 0
## 98006 98007 98008 98010 98011 98014 98019 98022 98023 98024 98027 98028 98029
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 1 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## 98030 98031 98032 98033 98034 98038 98039 98040 98042 98045 98052 98053 98055
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## 98056 98058 98059 98065 98070 98072 98074 98075 98077 98092 98102 98103 98105
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 1 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## 98106 98107 98108 98109 98112 98115 98116 98117 98118 98119 98122 98125 98126
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 1 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
## 98133 98136 98144 98146 98148 98155 98166 98168 98177 98178 98188 98198 98199
## 1 0 0 0 0 0 0 0 0 0 1 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 1 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0
dim(kc_house)
## [1] 17675 82
hist(price)
## 数据切分
set.seed(123)
datasplist <- splitForTrainingAndTest(kc_house[,2:82],price,
ratio = 0.3)
## MLP回归模型
system.time(
mlpreg <- mlp(datasplist$inputsTrain,datasplist$targetsTrain,maxit = 200,
size = c(100,50,100,50),learnFunc = "Rprop",
inputsTest=datasplist$inputsTest,
targetsTest = datasplist$targetsTest,
metric = "RSME")
)
## user system elapsed
## 419.957 2.279 448.088
## 可视化模型的效果
plotIterativeError(mlpreg,main = "MLP Iterative Erro")
plotRegressionError(datasplist$targetsTrain,mlpreg$fitted.values,
main="MLP train fit")
plotRegressionError(datasplist$targetsTest,mlpreg$fittedTestValues,
main="MLP test fit")
## 在训练集上的误差
mlppre_train <- denormalizeData(mlpreg$fitted.values,NormParameters)
mlp_train <- denormalizeData(datasplist$targetsTrain,NormParameters)
train_mae <- mae(mlp_train,mlppre_train)
sprintf("训练集上的绝对值误差: %f",train_mae)
## [1] "训练集上的绝对值误差: 47971.117964"
mlppre_test <- denormalizeData(mlpreg$fittedTestValues,NormParameters)
mlp_test <- denormalizeData(datasplist$targetsTest,NormParameters)
test_mae <- mae(mlp_test,mlppre_test)
sprintf("测试集上的绝对值误差: %f",test_mae)
## [1] "测试集上的绝对值误差: 55811.877215"