library(ggplot2)
library(tidyr)
## Simple linear regression
onedata <- read.csv("data/chap5/simple linear regression.csv")
## Visualize the data
ggplot(onedata,aes(x = x,y = y))+
theme_bw()+
geom_point(colour = "red")+
geom_smooth(method='lm',formula=y~x)
summary(lm(y~x,data = onedata))
##
## Call:
## lm(formula = y ~ x, data = onedata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.6515 -2.0356 -0.1612 2.0239 8.3965
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.461811 0.359560 -1.284 0.2
## x 1.014335 0.006162 164.598 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.037 on 298 degrees of freedom
## Multiple R-squared: 0.9891, Adjusted R-squared: 0.9891
## F-statistic: 2.709e+04 on 1 and 298 DF, p-value: < 2.2e-16
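## As a sanity check, the OLS estimates can be reproduced by hand (a minimal
## sketch; it should match the coefficients in the summary above):
b1 <- with(onedata, cov(x,y)/var(x)) ## slope = cov(x,y)/var(x)
b0 <- with(onedata, mean(y) - b1*mean(x)) ## intercept = mean(y) - slope*mean(x)
c(intercept = b0, slope = b1)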
## (kept for reference: the commented code below generated the polynomial-regression data)
# x = seq(-5,10,length.out = 100)
# y = 0.85*x^3-4.2*x^2-10.05*x+3.78
# y <- y + rnorm(100,mean = 20,sd = 20)
# index <- sample(1:100,100)
# datad <- data.frame(x = x[index],y = y[index])
# plot(datad$x,datad$y)
# write.csv(datad,"data/chap5/Polynomial regression.csv",quote = F,row.names = F)
## Read the data
polydata <- read.csv("data/chap5/Polynomial regression.csv")
## Visualize the data; a straight-line fit is clearly inappropriate
ggplot(polydata,aes(x=x,y = y))+geom_point()+theme_bw()
## Fit a cubic polynomial and inspect the fit
lmp3 <- lm(y~poly(x,3),data = polydata)
summary(lmp3)
##
## Call:
## lm(formula = y ~ poly(x, 3), data = polydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.092 -12.929 -0.862 11.151 41.806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.057 1.871 14.46 <2e-16 ***
## poly(x, 3)1 604.461 18.713 32.30 <2e-16 ***
## poly(x, 3)2 367.974 18.713 19.66 <2e-16 ***
## poly(x, 3)3 534.974 18.713 28.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.71 on 96 degrees of freedom
## Multiple R-squared: 0.959, Adjusted R-squared: 0.9578
## F-statistic: 749.1 on 3 and 96 DF, p-value: < 2.2e-16
poly3 <- predict(lmp3,polydata)
polydata$poly3 <- poly3
## Fit polynomial regressions of degree 1 through 4
lmp1 <- lm(y~poly(x,1),data = polydata)
poly1 <- predict(lmp1,polydata)
polydata$poly1 <- poly1
lmp2 <- lm(y~poly(x,2),data = polydata)
poly2 <- predict(lmp2,polydata)
polydata$poly2 <- poly2
lmp4 <- lm(y~poly(x,4),data = polydata)
poly4 <- predict(lmp4,polydata)
polydata$poly4 <- poly4
## Visualize the fits of the four polynomial models
polydatalong <- gather(polydata,key="model",value="value",
c("poly1","poly2","poly3","poly4"))
ggplot(polydatalong)+theme_bw()+geom_point(aes(x,y))+
geom_line(aes(x = x,y = value,linetype = model,colour = model),size = 0.8)+
theme(legend.position = c(0.1,0.8))
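## Because the four models are nested, the polynomial degree can also be
## compared with sequential F-tests (a quick sketch, not part of the
## original analysis):
anova(lmp1, lmp2, lmp3, lmp4)
## Each row tests whether the added term significantly reduces the residual
## sum of squares.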
## Simulate data: y is a degree-5 polynomial of x plus Gaussian noise
set.seed(123)
n <- 100
x <- runif(n,-5,5)
beta <- c(5,-2,0,0.05,0,0.01) ## true coefficients for 1, x, x^2, ..., x^5
lmdata <- data.frame(x = x,x2=x^2,x3=x^3,x4=x^4,x5=x^5)
X <- cbind(rep(1,n),lmdata)
y <- as.matrix(X) %*% beta + rnorm(n,0,5)
lmdata$y <- y
## Fit the full degree-5 polynomial model by ordinary least squares
lm_mod <- lm(y~.,data = lmdata)
coef(lm_mod)
## (Intercept) x x2 x3 x4 x5
## 3.881775302 -3.710302516 0.202177359 0.300142441 -0.007071404 0.002174061
## bas.lm uses Gibbs sampling to draw regression coefficients from the posterior distribution of a linear model with Gaussian errors
library(BAS)
lm_bas <- bas.lm(y~.,data = lmdata,method = "MCMC",
prior = "BIC",modelprior = uniform(),
MCMC.iterations = 10000)
## Warning in model.matrix.default(mt, mf, contrasts): non-list contrasts argument
## ignored
coef(lm_bas)
##
## Marginal Posterior Summaries of Coefficients:
##
## Using BMA
##
## Based on the top 16 models
## post mean post SD post p(B != 0)
## Intercept 4.901e+00 4.766e-01 1.000e+00
## x -3.902e+00 6.080e-01 9.993e-01
## x2 1.011e-02 4.683e-02 1.384e-01
## x3 3.387e-01 8.142e-02 9.653e-01
## x4 8.067e-05 1.880e-03 1.128e-01
## x5 7.061e-04 2.714e-03 1.424e-01
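## To see which models dominate the posterior, summary() on the bas object
## lists the top models and their posterior probabilities (a sketch):
summary(lm_bas)
## The high inclusion probabilities of x and x3 above match the largest
## nonzero entries of the true beta used to simulate the data.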
## Visualize the difference between the two models' fits
lm_mod_pre <- predict(lm_mod,lmdata)
lm_bas_pre <- predict(lm_bas,lmdata)
poltdata <- data.frame(x=x,y=y)
poltdata$lm_mod_pre <- lm_mod_pre
poltdata$lm_bas_pre <- lm_bas_pre$fit
gather(poltdata,key="model",value="value",c("lm_mod_pre","lm_bas_pre"))%>%
ggplot()+theme_bw()+geom_point(aes(x,y))+
geom_line(aes(x = x,y = value,linetype = model,colour = model),size = 0.8)+
theme(legend.position = c(0.1,0.8))
library(ggcorrplot)
library(tidyr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Average house-price data for different regions of the USA
## Read the data
house <- read.csv("data/chap5/USA_Housing.csv")
head(house)
## AvgAreaIncome AvgAreaHouseAge AvgAreaNumberRooms AvgAreaNumberofBedrooms
## 1 79545.46 5.682861 7.009188 4.09
## 2 79248.64 6.002900 6.730821 3.09
## 3 61287.07 5.865890 8.512727 5.13
## 4 63345.24 7.188236 5.586729 3.26
## 5 59982.20 5.040555 7.839388 4.23
## 6 80175.75 4.988408 6.104512 4.04
## AreaPopulation AvgPrice
## 1 23086.80 1059033.6
## 2 40173.07 1505890.9
## 3 36882.16 1058988.0
## 4 34310.24 1260616.8
## 5 26354.11 630943.5
## 6 26748.43 1068138.1
colnames(house)
## [1] "AvgAreaIncome" "AvgAreaHouseAge"
## [3] "AvgAreaNumberRooms" "AvgAreaNumberofBedrooms"
## [5] "AreaPopulation" "AvgPrice"
## Explore the data
summary(house)
## AvgAreaIncome AvgAreaHouseAge AvgAreaNumberRooms AvgAreaNumberofBedrooms
## Min. : 17797 Min. :2.644 Min. : 3.236 Min. :2.000
## 1st Qu.: 61481 1st Qu.:5.322 1st Qu.: 6.299 1st Qu.:3.140
## Median : 68804 Median :5.970 Median : 7.003 Median :4.050
## Mean : 68583 Mean :5.977 Mean : 6.988 Mean :3.981
## 3rd Qu.: 75783 3rd Qu.:6.651 3rd Qu.: 7.666 3rd Qu.:4.490
## Max. :107702 Max. :9.519 Max. :10.760 Max. :6.500
## AreaPopulation AvgPrice
## Min. : 172.6 Min. : 15939
## 1st Qu.:29403.9 1st Qu.: 997577
## Median :36199.4 Median :1232669
## Mean :36163.5 Mean :1232073
## 3rd Qu.:42861.3 3rd Qu.:1471210
## Max. :69621.7 Max. :2469066
## Visualize the distribution of each variable
## Density curves
houselong <- gather(house,key="varname",value="value",1:6)
ggplot(houselong)+theme_bw()+
geom_density(aes(value),fill = "red",alpha = 0.5)+
facet_wrap(.~varname,scales = "free")+
theme(axis.text.x = element_text(angle = 30))
## Correlation heat map to examine relationships between the variables
## Compute the correlation matrix
house_cor <- cor(house)
ggcorrplot(house_cor,method = "square",lab = TRUE)+
theme(axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10))
## Multiple linear regression
lm1 <- lm(AvgPrice~.,data = house)
summary(lm1)
##
## Call:
## lm(formula = AvgPrice ~ ., data = house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -337020 -70236 320 69175 361870
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.637e+06 1.716e+04 -153.708 <2e-16 ***
## AvgAreaIncome 2.158e+01 1.343e-01 160.656 <2e-16 ***
## AvgAreaHouseAge 1.656e+05 1.443e+03 114.754 <2e-16 ***
## AvgAreaNumberRooms 1.207e+05 1.605e+03 75.170 <2e-16 ***
## AvgAreaNumberofBedrooms 1.651e+03 1.309e+03 1.262 0.207
## AreaPopulation 1.520e+01 1.442e-01 105.393 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 101200 on 4994 degrees of freedom
## Multiple R-squared: 0.918, Adjusted R-squared: 0.9179
## F-statistic: 1.119e+04 on 5 and 4994 DF, p-value: < 2.2e-16
## The output shows that AvgAreaNumberofBedrooms is not significant
## Drop it and fit a new regression model
lm2 <- lm(AvgPrice~AvgAreaIncome+AvgAreaHouseAge+AvgAreaNumberRooms
+AreaPopulation,data = house)
summary(lm2)
##
## Call:
## lm(formula = AvgPrice ~ AvgAreaIncome + AvgAreaHouseAge + AvgAreaNumberRooms +
## AreaPopulation, data = house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -338419 -70058 132 69074 362025
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.638e+06 1.716e+04 -153.73 <2e-16 ***
## AvgAreaIncome 2.158e+01 1.343e-01 160.74 <2e-16 ***
## AvgAreaHouseAge 1.657e+05 1.443e+03 114.77 <2e-16 ***
## AvgAreaNumberRooms 1.216e+05 1.423e+03 85.48 <2e-16 ***
## AreaPopulation 1.520e+01 1.442e-01 105.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 101200 on 4995 degrees of freedom
## Multiple R-squared: 0.918, Adjusted R-squared: 0.9179
## F-statistic: 1.398e+04 on 4 and 4995 DF, p-value: < 2.2e-16
## Visualize the regression coefficients
ggcoef(lm2,exclude_intercept = T,vline_color = "red",
errorbar_color = "blue",errorbar_height = 0.1)+
theme_bw()
## Diagnostic plots for the regression model
par(mfrow = c(2,2))
plot(lm2)
If several predictors in a regression model are not significant, stepwise regression can select variables automatically, as sketched below.
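## A minimal sketch on the housing model above (not run in the original text):
## step() adds or drops predictors to minimize AIC, automating the manual
## variable removal performed for lm2.
# lm_step <- step(lm1, direction = "both", trace = 0)
# summary(lm_step)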
library(readxl)
library(GGally)
library(Metrics)
library(car)
## Loading required package: carData
ENB <- read_excel("data/chap5/ENB2012.xlsx")
head(ENB)
## # A tibble: 6 x 9
## X1 X2 X3 X4 X5 X6 X7 X8 Y1
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.98 514. 294 110. 7 2 0 0 15.6
## 2 0.98 514. 294 110. 7 3 0 0 15.6
## 3 0.98 514. 294 110. 7 4 0 0 15.6
## 4 0.98 514. 294 110. 7 5 0 0 15.6
## 5 0.9 564. 318. 122. 7 2 0 0 20.8
## 6 0.9 564. 318. 122. 7 3 0 0 21.5
summary(ENB)
## X1 X2 X3 X4
## Min. :0.6200 Min. :514.5 Min. :245.0 Min. :110.2
## 1st Qu.:0.6825 1st Qu.:606.4 1st Qu.:294.0 1st Qu.:140.9
## Median :0.7500 Median :673.8 Median :318.5 Median :183.8
## Mean :0.7642 Mean :671.7 Mean :318.5 Mean :176.6
## 3rd Qu.:0.8300 3rd Qu.:741.1 3rd Qu.:343.0 3rd Qu.:220.5
## Max. :0.9800 Max. :808.5 Max. :416.5 Max. :220.5
## X5 X6 X7 X8 Y1
## Min. :3.50 Min. :2.00 Min. :0.0000 Min. :0.000 Min. : 6.01
## 1st Qu.:3.50 1st Qu.:2.75 1st Qu.:0.1000 1st Qu.:1.750 1st Qu.:12.99
## Median :5.25 Median :3.50 Median :0.2500 Median :3.000 Median :18.95
## Mean :5.25 Mean :3.50 Mean :0.2344 Mean :2.812 Mean :22.31
## 3rd Qu.:7.00 3rd Qu.:4.25 3rd Qu.:0.4000 3rd Qu.:4.000 3rd Qu.:31.67
## Max. :7.00 Max. :5.00 Max. :0.4000 Max. :5.000 Max. :43.10
str(ENB)
## Classes 'tbl_df', 'tbl' and 'data.frame': 768 obs. of 9 variables:
## $ X1: num 0.98 0.98 0.98 0.98 0.9 0.9 0.9 0.9 0.86 0.86 ...
## $ X2: num 514 514 514 514 564 ...
## $ X3: num 294 294 294 294 318 ...
## $ X4: num 110 110 110 110 122 ...
## $ X5: num 7 7 7 7 7 7 7 7 7 7 ...
## $ X6: num 2 3 4 5 2 3 4 5 2 3 ...
## $ X7: num 0 0 0 0 0 0 0 0 0 0 ...
## $ X8: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Y1: num 15.6 15.6 15.6 15.6 20.8 ...
## Explore the data
## Scatterplot matrix
ggscatmat(ENB)+theme(axis.text.x = element_text(angle = 60))
## Split into training (70%) and test sets
set.seed(12)
index <- sample(nrow(ENB),round(nrow(ENB)*0.7))
trainEnb <- ENB[index,]
testENB <- ENB[-index,]
Enblm <- lm(Y1~.,data = trainEnb)
summary(Enblm)
##
## Call:
## lm(formula = Y1 ~ ., data = trainEnb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.267 -1.338 -0.076 1.357 7.405
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89.469237 23.761521 3.765 0.000185 ***
## X1 -68.556960 12.831595 -5.343 1.36e-07 ***
## X2 -0.091776 0.021423 -4.284 2.18e-05 ***
## X3 0.063317 0.008502 7.448 3.88e-13 ***
## X4 NA NA NA NA
## X5 4.168633 0.421822 9.882 < 2e-16 ***
## X6 -0.027859 0.114805 -0.243 0.808362
## X7 19.173761 0.995791 19.255 < 2e-16 ***
## X8 0.181443 0.085545 2.121 0.034384 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.979 on 530 degrees of freedom
## Multiple R-squared: 0.9137, Adjusted R-squared: 0.9126
## F-statistic: 801.9 on 7 and 530 DF, p-value: < 2.2e-16
## Coefficients: (1 not defined because of singularities)
## Because of this singularity, no coefficient was estimated for one variable (X4)
prelm <- predict(Enblm,testENB)
## Warning in predict.lm(Enblm, testENB): prediction from a rank-deficient fit may
## be misleading
## Mean squared error
sprintf("Mean squared error: %f",mse(testENB$Y1,prelm))
## [1] "Mean squared error: 8.113635"
## Check the model for multicollinearity
kappa(Enblm,exact=TRUE) ## exact = TRUE computes the exact condition number
## [1] 2.045768e+15
## The condition number is enormous, indicating severe multicollinearity among the variables
## vif(Enblm) would fail here, reporting that the model has aliased coefficients
alias(Enblm)
## Model :
## Y1 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8
##
## Complete :
## (Intercept) X1 X2 X3 X5 X6 X7 X8
## X4 0 0 1/2 -1/2 0 0 0 0
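## The alias table says X4 = X2/2 - X3/2 exactly, which can be checked
## directly on the training data (a quick sketch; in the ENB2012 data X4 is
## the roof area and X2, X3 the surface and wall areas):
all.equal(trainEnb$X4, (trainEnb$X2 - trainEnb$X3)/2)
## This exact linear dependence is why one coefficient above is NA.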
## Stepwise regression
Enbstep <- step(Enblm,direction = "both")
## Start: AIC=1182.64
## Y1 ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8
##
##
## Step: AIC=1182.64
## Y1 ~ X1 + X2 + X3 + X5 + X6 + X7 + X8
##
## Df Sum of Sq RSS AIC
## - X6 1 0.5 4705.3 1180.7
## <none> 4704.8 1182.6
## - X8 1 39.9 4744.7 1185.2
## - X2 1 162.9 4867.7 1199.0
## - X1 1 253.4 4958.2 1208.9
## - X3 1 492.4 5197.2 1234.2
## - X5 1 867.0 5571.8 1271.6
## - X7 1 3291.1 7995.9 1466.0
##
## Step: AIC=1180.7
## Y1 ~ X1 + X2 + X3 + X5 + X7 + X8
##
## Df Sum of Sq RSS AIC
## <none> 4705.3 1180.7
## + X6 1 0.5 4704.8 1182.6
## - X8 1 39.9 4745.2 1183.2
## - X2 1 163.1 4868.4 1197.0
## - X1 1 254.0 4959.3 1207.0
## - X3 1 492.0 5197.3 1232.2
## - X5 1 868.1 5573.4 1269.8
## - X7 1 3290.8 7996.1 1464.0
summary(Enbstep)
##
## Call:
## lm(formula = Y1 ~ X1 + X2 + X3 + X5 + X7 + X8, data = trainEnb)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.310 -1.335 -0.055 1.355 7.420
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89.467713 23.740454 3.769 0.000183 ***
## X1 -68.621845 12.817435 -5.354 1.28e-07 ***
## X2 -0.091814 0.021404 -4.290 2.13e-05 ***
## X3 0.063215 0.008484 7.451 3.78e-13 ***
## X5 4.170632 0.421367 9.898 < 2e-16 ***
## X7 19.172759 0.994900 19.271 < 2e-16 ***
## X8 0.181390 0.085469 2.122 0.034275 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.977 on 531 degrees of freedom
## Multiple R-squared: 0.9137, Adjusted R-squared: 0.9127
## F-statistic: 937.2 on 6 and 531 DF, p-value: < 2.2e-16
## Check the reduced model for multicollinearity
kappa(Enbstep,exact=TRUE)
## [1] 157187.8
## The condition number shrank by a factor of roughly 10^10, so the multicollinearity is greatly alleviated
vif(Enbstep)
## X1 X2 X3 X5 X7 X8
## 108.869289 211.529846 7.914420 32.990674 1.027083 1.027811
## Prediction error on the test set
prestep <- predict(Enbstep,testENB)
## Mean squared error
sprintf("Mean squared error: %f",mse(testENB$Y1,prestep))
## [1] "Mean squared error: 8.111581"
## Compare test-set predictions with the observed values
## Prepare the data: order the test cases by the observed response
index <- order(testENB$Y1)
X <- seq_along(index) ## plotting positions 1..n
Y1 <- testENB$Y1[index]
lmpre <- prelm[index]
steppre <- prestep[index]
plotdata <- data.frame(X = X,Y1 = Y1,lmpre =lmpre,steppre = steppre)
head(plotdata)
## X Y1 lmpre steppre
## 13 1 6.01 5.770176 5.788433
## 12 2 6.05 5.798034 5.788433
## 14 3 6.85 7.859234 7.818214
## 16 4 7.10 9.162972 9.176214
## 15 5 7.18 9.218689 9.176214
## 19 6 8.45 10.510743 10.519776
plotdata <- gather(plotdata,key="model",value="value",c(-X,-Y1))
## Plot
ggplot(plotdata,aes(x = X))+theme_bw()+
geom_point(aes(y = Y1),colour = "red",alpha = 0.5)+
geom_line(aes(y = value,linetype = model,colour = model),size = 0.6)+
theme(legend.position = c(0.1,0.8))
Data source: https://www.kaggle.com/primaryobjects/voicegender/home
This database was created to identify a voice as male or female based upon acoustic properties of the voice and speech. The dataset consists of 3,168 recorded voice samples collected from male and female speakers. The voice samples are pre-processed by acoustic analysis in R using the seewave and tuneR packages, with an analyzed frequency range of 0-280 Hz (the human vocal range).
The following acoustic properties of each voice are measured and included within the CSV:
meanfreq: mean frequency (in kHz)
sd: standard deviation of frequency
median: median frequency (in kHz)
Q25: first quartile (in kHz)
Q75: third quartile (in kHz)
IQR: interquartile range (in kHz)
skew: skewness (see note in specprop description)
kurt: kurtosis (see note in specprop description)
sp.ent: spectral entropy
sfm: spectral flatness
mode: mode frequency
centroid: frequency centroid (see specprop)
peakf: peak frequency (frequency with highest energy)
meanfun: average of fundamental frequency measured across acoustic signal
minfun: minimum fundamental frequency measured across acoustic signal
maxfun: maximum fundamental frequency measured across acoustic signal
meandom: average of dominant frequency measured across acoustic signal
mindom: minimum of dominant frequency measured across acoustic signal
maxdom: maximum of dominant frequency measured across acoustic signal
dfrange: range of dominant frequency measured across acoustic signal
modindx: modulation index. Calculated as the accumulated absolute difference between adjacent measurements of fundamental frequencies divided by the frequency range
label: male or female
## Data preparation and exploration
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(tidyr)
library(corrplot)
## corrplot 0.84 loaded
voice <- read.csv("data/chap5/voice.csv",stringsAsFactors = F)
head(voice)
## meanfreq sd median Q25 Q75 IQR skew
## 1 0.05978098 0.06424127 0.03202691 0.015071489 0.09019344 0.07512195 12.863462
## 2 0.06600874 0.06731003 0.04022873 0.019413867 0.09266619 0.07325232 22.423285
## 3 0.07731550 0.08382942 0.03671846 0.008701057 0.13190802 0.12320696 30.757155
## 4 0.15122809 0.07211059 0.15801119 0.096581728 0.20795525 0.11137352 1.232831
## 5 0.13512039 0.07914610 0.12465623 0.078720218 0.20604493 0.12732471 1.101174
## 6 0.13278641 0.07955687 0.11908985 0.067957993 0.20959160 0.14163361 1.932562
## kurt sp.ent sfm mode centroid meanfun minfun
## 1 274.402906 0.8933694 0.4919178 0.00000000 0.05978098 0.08427911 0.01570167
## 2 634.613855 0.8921932 0.5137238 0.00000000 0.06600874 0.10793655 0.01582591
## 3 1024.927705 0.8463891 0.4789050 0.00000000 0.07731550 0.09870626 0.01565558
## 4 4.177296 0.9633225 0.7272318 0.08387819 0.15122809 0.08896485 0.01779755
## 5 4.333713 0.9719551 0.7835681 0.10426140 0.13512039 0.10639784 0.01693122
## 6 8.308895 0.9631813 0.7383070 0.11255543 0.13278641 0.11013192 0.01711230
## maxfun meandom mindom maxdom dfrange modindx label
## 1 0.2758621 0.007812500 0.0078125 0.0078125 0.0000000 0.00000000 male
## 2 0.2500000 0.009014423 0.0078125 0.0546875 0.0468750 0.05263158 male
## 3 0.2711864 0.007990057 0.0078125 0.0156250 0.0078125 0.04651163 male
## 4 0.2500000 0.201497396 0.0078125 0.5625000 0.5546875 0.24711908 male
## 5 0.2666667 0.712812500 0.0078125 5.4843750 5.4765625 0.20827389 male
## 6 0.2539683 0.298221983 0.0078125 2.7265625 2.7187500 0.12515964 male
summary(voice)
## meanfreq sd median Q25
## Min. :0.03936 Min. :0.01836 Min. :0.01097 Min. :0.0002288
## 1st Qu.:0.16366 1st Qu.:0.04195 1st Qu.:0.16959 1st Qu.:0.1110865
## Median :0.18484 Median :0.05916 Median :0.19003 Median :0.1402864
## Mean :0.18091 Mean :0.05713 Mean :0.18562 Mean :0.1404556
## 3rd Qu.:0.19915 3rd Qu.:0.06702 3rd Qu.:0.21062 3rd Qu.:0.1759388
## Max. :0.25112 Max. :0.11527 Max. :0.26122 Max. :0.2473469
## Q75 IQR skew kurt
## Min. :0.04295 Min. :0.01456 Min. : 0.1417 Min. : 2.068
## 1st Qu.:0.20875 1st Qu.:0.04256 1st Qu.: 1.6496 1st Qu.: 5.670
## Median :0.22568 Median :0.09428 Median : 2.1971 Median : 8.319
## Mean :0.22476 Mean :0.08431 Mean : 3.1402 Mean : 36.569
## 3rd Qu.:0.24366 3rd Qu.:0.11418 3rd Qu.: 2.9317 3rd Qu.: 13.649
## Max. :0.27347 Max. :0.25223 Max. :34.7255 Max. :1309.613
## sp.ent sfm mode centroid
## Min. :0.7387 Min. :0.03688 Min. :0.0000 Min. :0.03936
## 1st Qu.:0.8618 1st Qu.:0.25804 1st Qu.:0.1180 1st Qu.:0.16366
## Median :0.9018 Median :0.39634 Median :0.1866 Median :0.18484
## Mean :0.8951 Mean :0.40822 Mean :0.1653 Mean :0.18091
## 3rd Qu.:0.9287 3rd Qu.:0.53368 3rd Qu.:0.2211 3rd Qu.:0.19915
## Max. :0.9820 Max. :0.84294 Max. :0.2800 Max. :0.25112
## meanfun minfun maxfun meandom
## Min. :0.05557 Min. :0.009775 Min. :0.1031 Min. :0.007812
## 1st Qu.:0.11700 1st Qu.:0.018223 1st Qu.:0.2540 1st Qu.:0.419828
## Median :0.14052 Median :0.046110 Median :0.2712 Median :0.765795
## Mean :0.14281 Mean :0.036802 Mean :0.2588 Mean :0.829211
## 3rd Qu.:0.16958 3rd Qu.:0.047904 3rd Qu.:0.2775 3rd Qu.:1.177166
## Max. :0.23764 Max. :0.204082 Max. :0.2791 Max. :2.957682
## mindom maxdom dfrange modindx
## Min. :0.004883 Min. : 0.007812 Min. : 0.000 Min. :0.00000
## 1st Qu.:0.007812 1st Qu.: 2.070312 1st Qu.: 2.045 1st Qu.:0.09977
## Median :0.023438 Median : 4.992188 Median : 4.945 Median :0.13936
## Mean :0.052647 Mean : 5.047277 Mean : 4.995 Mean :0.17375
## 3rd Qu.:0.070312 3rd Qu.: 7.007812 3rd Qu.: 6.992 3rd Qu.:0.20918
## Max. :0.458984 Max. :21.867188 Max. :21.844 Max. :0.93237
## label
## Length:3168
## Class :character
## Mode :character
##
##
##
table(voice$label)
##
## female male
## 1584 1584
str(voice)
## 'data.frame': 3168 obs. of 21 variables:
## $ meanfreq: num 0.0598 0.066 0.0773 0.1512 0.1351 ...
## $ sd : num 0.0642 0.0673 0.0838 0.0721 0.0791 ...
## $ median : num 0.032 0.0402 0.0367 0.158 0.1247 ...
## $ Q25 : num 0.0151 0.0194 0.0087 0.0966 0.0787 ...
## $ Q75 : num 0.0902 0.0927 0.1319 0.208 0.206 ...
## $ IQR : num 0.0751 0.0733 0.1232 0.1114 0.1273 ...
## $ skew : num 12.86 22.42 30.76 1.23 1.1 ...
## $ kurt : num 274.4 634.61 1024.93 4.18 4.33 ...
## $ sp.ent : num 0.893 0.892 0.846 0.963 0.972 ...
## $ sfm : num 0.492 0.514 0.479 0.727 0.784 ...
## $ mode : num 0 0 0 0.0839 0.1043 ...
## $ centroid: num 0.0598 0.066 0.0773 0.1512 0.1351 ...
## $ meanfun : num 0.0843 0.1079 0.0987 0.089 0.1064 ...
## $ minfun : num 0.0157 0.0158 0.0157 0.0178 0.0169 ...
## $ maxfun : num 0.276 0.25 0.271 0.25 0.267 ...
## $ meandom : num 0.00781 0.00901 0.00799 0.2015 0.71281 ...
## $ mindom : num 0.00781 0.00781 0.00781 0.00781 0.00781 ...
## $ maxdom : num 0.00781 0.05469 0.01562 0.5625 5.48438 ...
## $ dfrange : num 0 0.04688 0.00781 0.55469 5.47656 ...
## $ modindx : num 0 0.0526 0.0465 0.2471 0.2083 ...
## $ label : chr "male" "male" "male" "male" ...
## Visualize the correlation matrix
voice_cor <- cor(voice[,1:20])
ggcorrplot(voice_cor,method = "square")
corrplot.mixed(voice_cor,tl.col="black",tl.pos = "lt",
tl.cex = 0.8,number.cex = 0.45)
## Density of each feature for the two classes
plotdata <- gather(voice,key="variable",value="value",c(-label))
ggplot(plotdata,aes(fill = label))+
theme_bw()+geom_density(aes(value),alpha = 0.5)+
facet_wrap(~variable,scales = "free")
voice$label <- factor(voice$label,levels = c("male","female"),labels = c(0,1))
## Split into 70% training and 30% test sets
index <- createDataPartition(voice$label,p = 0.7)
voicetrain <- voice[index$Resample1,]
voicetest <- voice[-index$Resample1,]
## Train models on the training set
## Logistic regression using all variables
voicelm <- glm(label~.,data = voicetrain,family = "binomial")
summary(voicelm)
##
## Call:
## glm(formula = label ~ ., family = "binomial", data = voicetrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2583 -0.0954 -0.0004 0.0342 2.8631
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.602220 11.554693 0.744 0.456587
## meanfreq 82.593652 54.658578 1.511 0.130767
## sd 55.623097 41.001138 1.357 0.174901
## median -15.768692 15.340264 -1.028 0.303984
## Q25 50.386155 14.016350 3.595 0.000325 ***
## Q75 -91.480987 24.296415 -3.765 0.000166 ***
## IQR NA NA NA NA
## skew -0.083825 0.206253 -0.406 0.684434
## kurt 0.006574 0.005337 1.232 0.218058
## sp.ent -41.698545 12.714835 -3.280 0.001040 **
## sfm 11.019482 3.157580 3.490 0.000483 ***
## mode -2.967558 2.748219 -1.080 0.280226
## centroid NA NA NA NA
## meanfun 167.923619 11.030889 15.223 < 2e-16 ***
## minfun -35.846030 12.712735 -2.820 0.004807 **
## maxfun 3.352720 7.809408 0.429 0.667692
## meandom -0.146038 0.541165 -0.270 0.787269
## mindom 4.304827 3.132584 1.374 0.169377
## maxdom -0.007558 0.079362 -0.095 0.924126
## dfrange NA NA NA NA
## modindx 2.280421 1.994023 1.144 0.252778
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3074.80 on 2217 degrees of freedom
## Residual deviance: 376.44 on 2200 degrees of freedom
## AIC: 412.44
##
## Number of Fisher Scoring iterations: 8
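## The three NA coefficients are exact linear combinations of other columns,
## which follows from the variable definitions (IQR = Q75 - Q25,
## centroid = meanfreq, dfrange = maxdom - mindom) and can be verified (a sketch):
with(voicetrain, c(all.equal(IQR, Q75 - Q25),
                   all.equal(centroid, meanfreq),
                   all.equal(dfrange, maxdom - mindom)))
## glm therefore drops IQR, centroid and dfrange rather than failing.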
## Apply stepwise selection to the logistic model to screen variables
voicelmstep <- step(voicelm,direction = "both")
## Start: AIC=412.44
## label ~ meanfreq + sd + median + Q25 + Q75 + IQR + skew + kurt +
## sp.ent + sfm + mode + centroid + meanfun + minfun + maxfun +
## meandom + mindom + maxdom + dfrange + modindx
##
##
## Step: AIC=412.44
## label ~ meanfreq + sd + median + Q25 + Q75 + IQR + skew + kurt +
## sp.ent + sfm + mode + centroid + meanfun + minfun + maxfun +
## meandom + mindom + maxdom + modindx
##
##
## Step: AIC=412.44
## label ~ meanfreq + sd + median + Q25 + Q75 + IQR + skew + kurt +
## sp.ent + sfm + mode + meanfun + minfun + maxfun + meandom +
## mindom + maxdom + modindx
##
##
## Step: AIC=412.44
## label ~ meanfreq + sd + median + Q25 + Q75 + skew + kurt + sp.ent +
## sfm + mode + meanfun + minfun + maxfun + meandom + mindom +
## maxdom + modindx
##
## Df Deviance AIC
## - maxdom 1 376.45 410.45
## - meandom 1 376.52 410.52
## - skew 1 376.61 410.61
## - maxfun 1 376.63 410.63
## - median 1 377.49 411.49
## - mode 1 377.61 411.61
## - modindx 1 377.73 411.73
## - kurt 1 377.92 411.92
## - mindom 1 378.30 412.30
## - sd 1 378.31 412.31
## <none> 376.44 412.44
## - meanfreq 1 378.72 412.72
## - minfun 1 384.74 418.74
## - sp.ent 1 388.49 422.49
## - Q25 1 390.16 424.16
## - sfm 1 390.22 424.22
## - Q75 1 390.93 424.93
## - meanfun 1 1541.71 1575.71
##
## Step: AIC=410.45
## label ~ meanfreq + sd + median + Q25 + Q75 + skew + kurt + sp.ent +
## sfm + mode + meanfun + minfun + maxfun + meandom + mindom +
## modindx
##
## Df Deviance AIC
## - skew 1 376.62 408.62
## - maxfun 1 376.64 408.64
## - meandom 1 376.67 408.67
## - median 1 377.49 409.49
## - mode 1 377.62 409.62
## - kurt 1 377.94 409.94
## - modindx 1 378.31 410.31
## - sd 1 378.31 410.31
## - mindom 1 378.33 410.33
## <none> 376.45 410.45
## - meanfreq 1 378.73 410.73
## + maxdom 1 376.44 412.44
## + dfrange 1 376.44 412.44
## - minfun 1 384.74 416.74
## - sp.ent 1 388.50 420.50
## - Q25 1 390.16 422.16
## - sfm 1 390.23 422.23
## - Q75 1 390.93 422.93
## - meanfun 1 1541.71 1573.71
##
## Step: AIC=408.62
## label ~ meanfreq + sd + median + Q25 + Q75 + kurt + sp.ent +
## sfm + mode + meanfun + minfun + maxfun + meandom + mindom +
## modindx
##
## Df Deviance AIC
## - maxfun 1 376.78 406.78
## - meandom 1 376.79 406.79
## - mode 1 377.63 407.63
## - median 1 377.66 407.66
## - sd 1 378.35 408.35
## - mindom 1 378.38 408.38
## - modindx 1 378.49 408.49
## <none> 376.62 408.62
## - meanfreq 1 378.82 408.82
## + skew 1 376.45 410.45
## + maxdom 1 376.61 410.61
## + dfrange 1 376.61 410.61
## - minfun 1 384.78 414.78
## - kurt 1 385.89 415.89
## - Q25 1 390.18 420.18
## - sfm 1 390.44 420.44
## - sp.ent 1 390.84 420.84
## - Q75 1 390.93 420.93
## - meanfun 1 1565.21 1595.21
##
## Step: AIC=406.78
## label ~ meanfreq + sd + median + Q25 + Q75 + kurt + sp.ent +
## sfm + mode + meanfun + minfun + meandom + mindom + modindx
##
## Df Deviance AIC
## - meandom 1 376.91 404.91
## - median 1 377.79 405.79
## - mode 1 377.94 405.94
## - mindom 1 378.38 406.38
## - sd 1 378.41 406.41
## - modindx 1 378.49 406.49
## <none> 376.78 406.78
## - meanfreq 1 378.98 406.98
## + maxfun 1 376.62 408.62
## + skew 1 376.64 408.64
## + maxdom 1 376.76 408.76
## + dfrange 1 376.76 408.76
## - minfun 1 385.08 413.08
## - kurt 1 385.89 413.89
## - Q25 1 390.21 418.21
## - sfm 1 390.86 418.86
## - Q75 1 390.96 418.96
## - sp.ent 1 391.00 419.00
## - meanfun 1 1662.73 1690.73
##
## Step: AIC=404.91
## label ~ meanfreq + sd + median + Q25 + Q75 + kurt + sp.ent +
## sfm + mode + meanfun + minfun + mindom + modindx
##
## Df Deviance AIC
## - median 1 377.84 403.84
## - mode 1 378.08 404.08
## - mindom 1 378.47 404.47
## - modindx 1 378.61 404.61
## - sd 1 378.64 404.64
## <none> 376.91 404.91
## - meanfreq 1 379.01 405.01
## + meandom 1 376.78 406.78
## + maxfun 1 376.79 406.79
## + maxdom 1 376.80 406.80
## + dfrange 1 376.80 406.80
## + skew 1 376.81 406.81
## - kurt 1 386.02 412.02
## - minfun 1 387.21 413.21
## - Q25 1 390.68 416.68
## - sfm 1 390.89 416.89
## - Q75 1 390.98 416.98
## - sp.ent 1 391.21 417.21
## - meanfun 1 1670.63 1696.63
##
## Step: AIC=403.84
## label ~ meanfreq + sd + Q25 + Q75 + kurt + sp.ent + sfm + mode +
## meanfun + minfun + mindom + modindx
##
## Df Deviance AIC
## - sd 1 378.76 402.76
## - mode 1 379.26 403.26
## - mindom 1 379.31 403.31
## - meanfreq 1 379.42 403.42
## - modindx 1 379.52 403.52
## <none> 377.84 403.84
## + median 1 376.91 404.91
## + skew 1 377.73 405.73
## + maxfun 1 377.74 405.74
## + meandom 1 377.79 405.79
## + dfrange 1 377.80 405.80
## + maxdom 1 377.80 405.80
## - kurt 1 386.34 410.34
## - minfun 1 387.98 411.98
## - sfm 1 391.37 415.37
## - sp.ent 1 392.11 416.11
## - Q75 1 396.46 420.46
## - Q25 1 403.68 427.68
## - meanfun 1 1684.26 1708.26
##
## Step: AIC=402.76
## label ~ meanfreq + Q25 + Q75 + kurt + sp.ent + sfm + mode + meanfun +
## minfun + mindom + modindx
##
## Df Deviance AIC
## - meanfreq 1 379.83 401.83
## - mindom 1 380.18 402.18
## - modindx 1 380.20 402.20
## - mode 1 380.20 402.20
## <none> 378.76 402.76
## + sd 1 377.84 403.84
## + meandom 1 378.60 404.60
## + median 1 378.64 404.64
## + dfrange 1 378.66 404.66
## + maxdom 1 378.66 404.66
## + maxfun 1 378.73 404.73
## + skew 1 378.75 404.75
## - kurt 1 387.41 409.41
## - minfun 1 392.71 414.71
## - sp.ent 1 397.15 419.15
## - Q75 1 403.22 425.22
## - Q25 1 404.84 426.84
## - sfm 1 406.06 428.06
## - meanfun 1 1692.68 1714.68
##
## Step: AIC=401.83
## label ~ Q25 + Q75 + kurt + sp.ent + sfm + mode + meanfun + minfun +
## mindom + modindx
##
## Df Deviance AIC
## - mode 1 380.64 400.64
## - modindx 1 381.26 401.26
## - mindom 1 381.39 401.39
## <none> 379.83 401.83
## + meanfreq 1 378.76 402.76
## + centroid 1 378.76 402.76
## + median 1 379.39 403.39
## + sd 1 379.42 403.42
## + maxfun 1 379.73 403.73
## + meandom 1 379.73 403.73
## + maxdom 1 379.74 403.74
## + dfrange 1 379.74 403.74
## + skew 1 379.83 403.83
## - kurt 1 388.93 408.93
## - minfun 1 393.46 413.46
## - sp.ent 1 398.12 418.12
## - sfm 1 406.52 426.52
## - Q75 1 434.74 454.74
## - Q25 1 505.79 525.79
## - meanfun 1 1703.69 1723.69
##
## Step: AIC=400.64
## label ~ Q25 + Q75 + kurt + sp.ent + sfm + meanfun + minfun +
## mindom + modindx
##
## Df Deviance AIC
## - mindom 1 381.98 399.98
## - modindx 1 382.39 400.39
## <none> 380.64 400.64
## + mode 1 379.83 401.83
## + sd 1 380.10 402.10
## + meanfreq 1 380.20 402.20
## + centroid 1 380.20 402.20
## + maxfun 1 380.47 402.47
## + meandom 1 380.52 402.52
## + median 1 380.53 402.53
## + dfrange 1 380.55 402.55
## + maxdom 1 380.55 402.55
## + skew 1 380.59 402.59
## - kurt 1 390.93 408.93
## - minfun 1 397.71 415.71
## - sp.ent 1 399.29 417.29
## - sfm 1 408.14 426.14
## - Q75 1 441.65 459.65
## - Q25 1 506.13 524.13
## - meanfun 1 1703.84 1721.84
##
## Step: AIC=399.98
## label ~ Q25 + Q75 + kurt + sp.ent + sfm + meanfun + minfun +
## modindx
##
## Df Deviance AIC
## <none> 381.98 399.98
## + mindom 1 380.64 400.64
## - modindx 1 384.80 400.80
## + mode 1 381.39 401.39
## + meanfreq 1 381.40 401.40
## + centroid 1 381.40 401.40
## + sd 1 381.51 401.51
## + median 1 381.78 401.78
## + dfrange 1 381.89 401.89
## + meandom 1 381.90 401.90
## + maxdom 1 381.90 401.90
## + skew 1 381.90 401.90
## + maxfun 1 381.97 401.97
## - kurt 1 392.55 408.55
## - minfun 1 399.82 415.82
## - sp.ent 1 400.88 416.88
## - sfm 1 409.18 425.18
## - Q75 1 444.68 460.68
## - Q25 1 517.03 533.03
## - meanfun 1 1704.31 1720.31
summary(voicelmstep)
##
## Call:
## glm(formula = label ~ Q25 + Q75 + kurt + sp.ent + sfm + meanfun +
## minfun + modindx, family = "binomial", data = voicetrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2305 -0.1027 -0.0004 0.0344 3.0515
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 15.390921 8.581947 1.793 0.072908 .
## Q25 61.553822 6.018806 10.227 < 2e-16 ***
## Q75 -52.723460 6.856350 -7.690 1.47e-14 ***
## kurt 0.004484 0.001204 3.726 0.000195 ***
## sp.ent -42.917919 10.358461 -4.143 3.42e-05 ***
## sfm 11.330760 2.381448 4.758 1.96e-06 ***
## meanfun 165.105856 10.264849 16.085 < 2e-16 ***
## minfun -43.227295 10.454967 -4.135 3.56e-05 ***
## modindx 2.610746 1.534036 1.702 0.088778 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3074.80 on 2217 degrees of freedom
## Residual deviance: 381.98 on 2209 degrees of freedom
## AIC: 399.98
##
## Number of Fisher Scoring iterations: 8
## Visualize how AIC changes as variables are removed
stepanova <- voicelmstep$anova
stepanova$Step <- as.factor(stepanova$Step)
ggplot(stepanova,aes(x = reorder(Step,-AIC),y = AIC))+
theme_bw(base_family = "STKaiti",base_size = 12)+
geom_point(colour = "red",size = 2)+
geom_text(aes(y = AIC-1,label = round(AIC,2)))+
theme(axis.text.x = element_text(angle = 30,size = 12))+
labs(x = "删除的特征")
## Compare test-set accuracy before and after stepwise selection
voicelmpre <- predict(voicelm,voicetest,type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
voicelmpre2 <- as.factor(ifelse(voicelmpre > 0.5,1,0))
voicesteppre <- predict(voicelmstep,voicetest,type = "response")
voicesteppre2 <- as.factor(ifelse(voicesteppre > 0.5,1,0))
sprintf("逻辑回归模型的精度为:%f",accuracy(voicetest$label,voicelmpre2))
## [1] "逻辑回归模型的精度为:0.968421"
sprintf("逐步逻辑回归模型的精度为:%f",accuracy(voicetest$label,voicesteppre2))
## [1] "逐步逻辑回归模型的精度为:0.968421"
## Compute the confusion matrices
table(voicetest$label,voicelmpre2)
## voicelmpre2
## 0 1
## 0 462 13
## 1 17 458
table(voicetest$label,voicesteppre2)
## voicesteppre2
## 0 1
## 0 461 14
## 1 16 459
## Plot ROC curves to compare the two models
## ROC coordinates for the full logistic regression model
pr <- prediction(voicelmpre, voicetest$label)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
prfdf <- data.frame(x = prf@x.values[[1]],logitic = prf@y.values[[1]])
## ROC coordinates for the stepwise model
pr <- prediction(voicesteppre, voicetest$label)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
## Add to the plotting data frame
prfdf$logiticstep <- prf@y.values[[1]]
prfdf2 <- gather(prfdf,key="model",value="y",2:3)
ggplot(prfdf2,aes(x= x,y = y,colour = model,linetype = model))+
theme_bw(base_family = "STKaiti")+geom_line(size = 1)+
theme(aspect.ratio=1)+
labs(x = "假正例率",y = "真正例率")
After dropping the ID column, the data contain three variables: num_awards (the count response), prog (the program type) and math (the math score).
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 3.0-2
## Read the data
poi_sim <- read.csv("data/chap5/poisson_sim.csv")
poi_sim <- poi_sim[,2:4]
poi_sim$prog <- factor(poi_sim$prog,levels=1:3,
labels=c("General", "Academic", "Vocational"))
## Histogram of the number of awards
hist(poi_sim$num_awards)
## Poisson regression; "-1" drops the intercept so each prog level gets its own term
model <- glm(num_awards~.-1,data = poi_sim,family = poisson(link = "log"))
summary(model)
##
## Call:
## glm(formula = num_awards ~ . - 1, family = poisson(link = "log"),
## data = poi_sim)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2043 -0.8436 -0.5106 0.2558 2.6796
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## progGeneral -5.24712 0.65845 -7.969 1.60e-15 ***
## progAcademic -4.16327 0.66288 -6.281 3.37e-10 ***
## progVocational -4.87732 0.62818 -7.764 8.21e-15 ***
## math 0.07015 0.01060 6.619 3.63e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 319.24 on 200 degrees of freedom
## Residual deviance: 189.45 on 196 degrees of freedom
## AIC: 373.5
##
## Number of Fisher Scoring iterations: 6
exp(coef(model))
## progGeneral progAcademic progVocational math
## 0.005262630 0.015556678 0.007617438 1.072671641
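## exp(coef) gives multiplicative effects on the expected count: each extra
## math point multiplies the expected number of awards by about 1.07.
## A small prediction sketch (hypothetical student in the Academic program
## with a math score of 70):
predict(model, newdata = data.frame(prog = "Academic", math = 70), type = "response")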
Ridge regression
library(readxl)
library(caret)
library(glmnet)
library(corrplot)
library(Metrics)
library(ggplot2)
## Read the data
diabete <- read.csv("data/chap5/diabetes.csv",sep = "\t")
## Visualize the correlation matrix
diabete_cor <- cor(diabete)
corrplot.mixed(diabete_cor,tl.col="black",tl.pos = "d",number.cex = 0.8)
## Split into training (70%) and test (30%) sets
set.seed(123)
d_index <- createDataPartition(diabete$Y,p = 0.7)
train_d <- diabete[d_index$Resample1,]
test_d <- diabete[-d_index$Resample1,]
## Standardize the data
scal <- preProcess(train_d,method = c("center","scale"))
train_ds <- predict(scal,train_d)
test_ds <- predict(scal,test_d)
## Inspect the means and standard deviations used for standardization
scal$mean
## AGE SEX BMI BP S1 S2 S3
## 48.610932 1.479100 26.370418 94.855241 190.337621 116.475563 50.522508
## S4 S5 S6 Y
## 4.036592 4.619805 91.305466 151.887460
scal$std
## AGE SEX BMI BP S1 S2 S3
## 12.9363108 0.5003681 4.4834851 13.6416762 34.6470475 30.3885396 13.2357461
## S4 S5 S6 Y
## 1.2961840 0.5170616 11.5498891 76.4088971
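## preProcess stores one mean and sd per column and predict() applies
## (x - mean)/sd; a quick equivalence check on the response (a sketch):
all.equal(train_ds$Y, (train_d$Y - scal$mean["Y"])/scal$std["Y"])
## The inverse transform used later (prediction * std + mean) undoes exactly
## this mapping.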
Fitting the ridge regression
## Use cross-validation on the training set to choose the ridge penalty
lambdas <- seq(0,5, length.out = 200)
X <- as.matrix(train_ds[,1:10])
Y <- train_ds[,11]
set.seed(1245)
ridge_model <- cv.glmnet(X,Y,alpha = 0,lambda = lambdas,nfolds =3)
## See how lambda affects the cross-validated mean squared error
plot(ridge_model)
## Plot the coefficient paths of the ridge model
plot(ridge_model$glmnet.fit, "lambda", label = T)
## Find the lambda with the smallest cross-validated MSE
ridge_min <- ridge_model$lambda.min
ridge_min
## [1] 0.1758794
## Refit the ridge model with ridge_min
ridge_best <- glmnet(X,Y,alpha = 0,lambda = ridge_min)
summary(ridge_best)
## Length Class Mode
## a0 1 -none- numeric
## beta 10 dgCMatrix S4
## df 1 -none- numeric
## dim 2 -none- numeric
## lambda 1 -none- numeric
## dev.ratio 1 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 5 -none- call
## nobs 1 -none- numeric
## Inspect the ridge coefficients
coef(ridge_best)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 9.630216e-17
## AGE 6.553009e-03
## SEX -8.009107e-02
## BMI 2.969255e-01
## BP 1.717147e-01
## S1 -1.524490e-02
## S2 -4.969365e-02
## S3 -8.842073e-02
## S4 8.167850e-02
## S5 2.463496e-01
## S6 5.792367e-02
## Evaluate predictions on the test set
test_pre <- predict(ridge_best,as.matrix(test_ds[,1:10]))
sprintf("标准化后平均绝对误差为: %f",mae(test_ds$Y,test_pre))
## [1] "标准化后平均绝对误差为: 0.590296"
## Un-standardize the predictions and compare with the original data
test_pre_o <- as.vector(test_pre[,1] * scal$std[11] + scal$mean[11])
sprintf("标准化前平均绝对误差为: %f",mae(test_d$Y,test_pre_o))
## [1] "标准化前平均绝对误差为: 45.103901"
## Lasso linear regression
library(readxl)
library(caret)
library(glmnet)
library(corrplot)
library(Metrics)
library(ggplot2)
## Read the data
diabete <- read.csv("data/chap5/diabetes.csv",sep = "\t")
## Visualize the correlation matrix
diabete_cor <- cor(diabete)
corrplot.mixed(diabete_cor,tl.col="black",tl.pos = "d",number.cex = 0.8)
Split the data and standardize
## Split into training (70%) and test (30%) sets
set.seed(123)
d_index <- createDataPartition(diabete$Y,p = 0.7)
train_d <- diabete[d_index$Resample1,]
test_d <- diabete[-d_index$Resample1,]
## Standardize the data
scal <- preProcess(train_d,method = c("center","scale"))
train_ds <- predict(scal,train_d)
test_ds <- predict(scal,test_d)
## Inspect the means and standard deviations used for standardization
scal$mean
## AGE SEX BMI BP S1 S2 S3
## 48.610932 1.479100 26.370418 94.855241 190.337621 116.475563 50.522508
## S4 S5 S6 Y
## 4.036592 4.619805 91.305466 151.887460
scal$std
## AGE SEX BMI BP S1 S2 S3
## 12.9363108 0.5003681 4.4834851 13.6416762 34.6470475 30.3885396 13.2357461
## S4 S5 S6 Y
## 1.2961840 0.5170616 11.5498891 76.4088971
## Use cross-validation on the training set to choose the lasso penalty
lambdas <- seq(0,2, length.out = 100)
X <- as.matrix(train_ds[,1:10])
Y <- train_ds[,11]
set.seed(1245)
lasso_model <- cv.glmnet(X,Y,alpha = 1,lambda = lambdas,nfolds =3)
## See how lambda affects the cross-validated mean squared error
plot(lasso_model)
## Plot the coefficient paths of the lasso model
plot(lasso_model$glmnet.fit, "lambda", label = T)
## Find the lambda with the smallest cross-validated MSE
lasso_min <- lasso_model$lambda.min
lasso_min
## [1] 0.02020202
## The largest lambda whose CV error is within one standard error of the minimum
lasso_lse <- lasso_model$lambda.1se
lasso_lse
## [1] 0.1616162
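## lambda.1se trades a little accuracy for a sparser model; refitting with it
## shows which predictors survive the stronger penalty (a sketch):
lasso_1se <- glmnet(X,Y,alpha = 1,lambda = lasso_lse)
coef(lasso_1se)
## Expect fewer nonzero coefficients than with lasso_min below.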
## Refit the lasso model with lasso_min
lasso_best <- glmnet(X,Y,alpha = 1,lambda = lasso_min)
summary(lasso_best)
## Length Class Mode
## a0 1 -none- numeric
## beta 10 dgCMatrix S4
## df 1 -none- numeric
## dim 2 -none- numeric
## lambda 1 -none- numeric
## dev.ratio 1 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 5 -none- call
## nobs 1 -none- numeric
## Inspect the lasso coefficients
coef(lasso_best)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 7.244990e-17
## AGE .
## SEX -6.269864e-02
## BMI 3.384728e-01
## BP 1.654105e-01
## S1 -3.031252e-02
## S2 .
## S3 -1.005771e-01
## S4 .
## S5 3.031539e-01
## S6 2.740312e-02
## Evaluate predictions on the test set
test_pre <- predict(lasso_best,as.matrix(test_ds[,1:10]))
sprintf("标准化后平均绝对误差为: %f",mae(test_ds$Y,test_pre))
## [1] "标准化后平均绝对误差为: 0.585239"
## Un-standardize the predictions and compare with the original data
test_pre_o <- as.vector(test_pre[,1] * scal$std[11] + scal$mean[11])
sprintf("标准化前平均绝对误差为: %f",mae(test_d$Y,test_pre_o))
## [1] "标准化前平均绝对误差为: 44.717477"
## Read the data
voice <- read.csv("data/chap5/voice.csv",stringsAsFactors = F)
voice$label <- factor(voice$label,levels = c("male","female"),labels = c(0,1))
set.seed(123)
## Split into 70% training and 30% test sets
index <- createDataPartition(voice$label,p = 0.7)
voicetrain <- voice[index$Resample1,]
voicetest <- voice[-index$Resample1,]
## Search for a suitable penalty
#lambdas <- seq(1,1000, length.out = 100)
lambdas <- c(0.000001,0.00001,0.0001,0.001,0.01,0.1,0.5,1,2)
X <- as.matrix(voicetrain[,1:20])
Y <- voicetrain$label
lasso_model <- cv.glmnet(X,Y,alpha = 1,lambda = lambdas,nfolds =3,
family = "binomial",type.measure = "class")
## See how lambda affects the cross-validated misclassification error
plot(lasso_model)
## Plot the coefficient paths
plot(lasso_model$glmnet.fit, "lambda", label = T)
## Find the lambda with the lowest cross-validated misclassification error
lasso_min <- lasso_model$lambda.min
lasso_min
## [1] 0.01
## Lasso logistic regression
## Refit the lasso model with lasso_min
lasso_best <- glmnet(X,Y,alpha = 1,lambda = lasso_min,
family = "binomial")
summary(lasso_best)
## Length Class Mode
## a0 1 -none- numeric
## beta 20 dgCMatrix S4
## df 1 -none- numeric
## dim 2 -none- numeric
## lambda 1 -none- numeric
## dev.ratio 1 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 6 -none- call
## nobs 1 -none- numeric
## Inspect the lasso coefficients
coef(lasso_best)
## 21 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -11.35938406
## meanfreq .
## sd .
## median .
## Q25 .
## Q75 -8.26087570
## IQR -28.81451361
## skew 0.04379662
## kurt .
## sp.ent .
## sfm .
## mode -0.48769260
## centroid .
## meanfun 116.51155790
## minfun -20.87439430
## maxfun .
## meandom .
## mindom .
## maxdom .
## dfrange .
## modindx .
## Evaluate predictions on the test set; type = "response" returns
## probabilities (the default "link" would return log-odds, for which a
## 0.5 cutoff is not a probability threshold)
test_pre <- predict(lasso_best,as.matrix(voicetest[,1:20]),type = "response")
test_pre <- as.factor(ifelse(test_pre > 0.5,1,0))
sprintf("Prediction accuracy on the test set: %f",accuracy(voicetest$label,test_pre))
## [1] "Prediction accuracy on the test set: 0.975789"
## Analyze model accuracy as the classification threshold varies
thresh <- seq(0.05,0.95,by = 0.05)
acc <- thresh
## The predicted probabilities do not change, so compute them once
test_prob <- predict(lasso_best,as.matrix(voicetest[,1:20]),type = "response")
for (ii in 1:length(thresh)){
test_pre <- as.factor(ifelse(test_prob > thresh[ii],1,0))
acc[ii] <- accuracy(voicetest$label,test_pre)
}
## Plot the accuracy curve
plotdata <- data.frame(thresh = thresh,acc = acc)
ggplot(plotdata,aes(x = thresh,y = acc))+
theme_bw(base_family = "STKaiti")+
geom_point()+geom_line()+ylim(c(0.95,1))+
scale_x_continuous("分类阈值",thresh)+
labs(y = "模型精度",title = "Lasso广义回归精度")+
theme(plot.title = element_text(hjust = 0.5))