## 1: Reading CSV files
## Approach 1: base R read.csv()
csvdata <- read.csv(file = "data/chap2/Iris.csv", header = TRUE)
head(csvdata)
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 6 5.4 3.9 1.7 0.4 Iris-setosa
str(csvdata)
## 'data.frame': 150 obs. of 6 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## Approach 2: base R read.table() with an explicit comma separator
csvdata <- read.table(
  file = "data/chap2/Iris.csv",
  header = TRUE,
  sep = ","
)
head(csvdata)
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 6 5.4 3.9 1.7 0.4 Iris-setosa
str(csvdata)
## 'data.frame': 150 obs. of 6 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## Approach 3: readr::read_csv() with one explicit type code per column.
## Type codes: c = character, i = integer, n = number, d = double,
## l = logical, D = date, T = date time, t = time, ? = guess
library(readr)
csvdata <- read_csv(
  file = "data/chap2/Iris.csv",
  col_names = TRUE,
  col_types = list("d", "d", "d", "d", "d", "c")
)
head(csvdata, 2)
## # A tibble: 2 x 6
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3 1.4 0.2 Iris-setosa
str(csvdata)
## tibble [150 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:150] 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr [1:150] "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. SepalLengthCm = col_double(),
## .. SepalWidthCm = col_double(),
## .. PetalLengthCm = col_double(),
## .. PetalWidthCm = col_double(),
## .. Species = col_character()
## .. )
## Save the data as CSV, once with readr and once with base R.
## readr::write_csv() never writes row names.
write_csv(csvdata, "data/chap2/IrisWrite_1.csv")
## Base write.csv() writes row names by default, which would prepend an
## unnamed index column that the source data does not have.
## row.names = FALSE keeps the column set identical to the first file.
write.csv(
  csvdata,
  "data/chap2/IrisWrite_2.csv",
  quote = FALSE,
  row.names = FALSE
)
## 2: Reading Excel data
library(readxl)
## Load the worksheet named "Iris" from the workbook.
exceldata <- read_excel(
  path = "data/chap2/Iris.xlsx",
  sheet = "Iris"
)
## NOTE(review): the positional 2 is forwarded to the str() method;
## presumably it acts as max.level - confirm.
str(exceldata, 2)
## tibble [150 × 6] (S3: tbl_df/tbl/data.frame)
## $ Id : num [1:150] 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr [1:150] "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## Reading image data
## Read a PNG image into a numeric array
library(png)
impng <- readPNG(source = "data/chap2/Rlogo.png")
## Rows / columns of the pixel array, used as the image aspect ratio
r <- nrow(impng) / ncol(impng)
## Open an empty plotting region of width 1 and height r (asp = 1 keeps
## the units of both axes equal)
plot(
  x = c(0, 1),
  y = c(0, r),
  type = "n",
  xlab = "",
  ylab = "",
  asp = 1
)
## Author's note: the next call is not supported in a Notebook, but runs
## fine in the Console
rasterImage(impng, xleft = 0, ybottom = 0, xright = 1, ytop = r)
str(impng)
## num [1:76, 1:100, 1:4] 0 0 0 0 0 0 0 0 0 0 ...
## load.image 可以读取多种格式的图像
library(imager)
## Loading required package: magrittr
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
## Load a JPEG with imager and keep its dimensions
imjpg <- load.image("data/chap2/image.jpg")
imdim <- dim(imjpg)
## Draw the image with axis limits spanning its full width and height.
## NOTE(review): an ascending ylim may change the vertical orientation of
## the drawn image - confirm this is intended.
plot(
  imjpg,
  xlim = c(1, width(imjpg)),
  ylim = c(1, height(imjpg))
)
### 长宽数据变换
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:imager':
##
## fill
## The following object is masked from 'package:magrittr':
##
## extract
## Load the wide-format Iris table (one column per measurement)
Iris <- read.csv(file = "data/chap2/Iris.csv", header = TRUE)
head(Iris, 2)
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3.0 1.4 0.2 Iris-setosa
str(Iris)
## 'data.frame': 150 obs. of 6 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## Wide -> long: stack the four measurement columns into a
## (varname, value) pair of columns.
## gather() is superseded by tidyr::pivot_longer().
Irislong <- gather(
  Iris,
  key = "varname",
  value = "value",
  SepalLengthCm:PetalWidthCm
)
head(Irislong, 2)
## Id Species varname value
## 1 1 Iris-setosa SepalLengthCm 5.1
## 2 2 Iris-setosa SepalLengthCm 4.9
str(Irislong)
## 'data.frame': 600 obs. of 4 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Species: chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## $ varname: chr "SepalLengthCm" "SepalLengthCm" "SepalLengthCm" "SepalLengthCm" ...
## $ value : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## Long -> wide: spread the (varname, value) pairs back into columns.
## spread() is superseded by tidyr::pivot_wider().
IrisWidth <- spread(
  Irislong,
  key = "varname",
  value = "value"
)
head(IrisWidth, 2)
## Id Species PetalLengthCm PetalWidthCm SepalLengthCm SepalWidthCm
## 1 1 Iris-setosa 1.4 0.2 5.1 3.5
## 2 2 Iris-setosa 1.4 0.2 4.9 3.0
str(IrisWidth)
## 'data.frame': 150 obs. of 6 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Species : chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## Reload the data and keep only the four numeric measurement columns
Iris <- read.csv(file = "data/chap2/Iris.csv", header = TRUE)
Iris <- Iris[, 2:5]
head(Iris, 2)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
str(Iris)
## 'data.frame': 150 obs. of 4 variables:
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## Centering: subtract each variable's mean from its values
Irisc <- scale(x = Iris, center = TRUE, scale = FALSE)
apply(X = Irisc, MARGIN = 2, FUN = range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] -1.543333 -1.054 -2.758667 -1.098667
## [2,] 2.056667 1.346 3.141333 1.301333
## Standardisation: subtract the mean, then divide by the standard
## deviation
Iriss <- scale(x = Iris, center = TRUE, scale = TRUE)
apply(X = Iriss, MARGIN = 2, FUN = range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] -1.863780 -2.430844 -1.563497 -1.439627
## [2,] 2.483699 3.104284 1.780377 1.705189
## Min-max normalisation: a linear transformation of the original data.
## With min and max the smallest and largest values of x, every value is
## mapped into the interval [0, 1]:
##   new value = (x - min) / (max - min)
##
## Arguments:
##   x      numeric vector to rescale.
##   na.rm  if TRUE, NA values are ignored when computing min and max
##          (they remain NA in the result). Defaults to FALSE, which is
##          the original behaviour.
## Returns a numeric vector of the same length as x. A vector whose
## values are all equal yields NaN, because the range is 0.
minmax <- function(x, na.rm = FALSE) {
  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  ## Return the expression itself rather than an assignment, so the
  ## result is visible when the function is called interactively.
  (x - lo) / (hi - lo)
}
## Rescale every column with minmax(), then check the range of each
## rescaled column
Iris01 <- apply(X = Iris, MARGIN = 2, FUN = minmax)
apply(X = Iris01, MARGIN = 2, FUN = range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] 0 0 0 0
## [2,] 1 1 1 1
## The same transformations using the caret package
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## The object returned by preProcess() can be applied to a data set
## with predict(). Other methods such as "scale" and "range" are
## available as well.
## 1: centering
center <- preProcess(Iris, method = "center")
Irisc <- predict(center, newdata = Iris)
head(Irisc, 2)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1 -0.7433333 0.446 -2.358667 -0.9986667
## 2 -0.9433333 -0.054 -2.358667 -0.9986667
apply(X = Irisc, MARGIN = 2, FUN = range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] -1.543333 -1.054 -2.758667 -1.098667
## [2,] 2.056667 1.346 3.141333 1.301333
## 2: standardisation (center, then scale)
scal <- preProcess(Iris, method = c("center", "scale"))
Iriss <- predict(scal, newdata = Iris)
head(Iriss, 2)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1 -0.8976739 1.0286113 -1.336794 -1.308593
## 2 -1.1392005 -0.1245404 -1.336794 -1.308593
apply(X = Iriss, MARGIN = 2, FUN = range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] -1.863780 -2.430844 -1.563497 -1.439627
## [2,] 2.483699 3.104284 1.780377 1.705189
## 3: rescaling to [0, 1]
minmax01 <- preProcess(
  Iris,
  method = "range",
  rangeBounds = c(0, 1)
)
Iris01 <- predict(minmax01, newdata = Iris)
apply(X = Iris01, MARGIN = 2, FUN = range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] 0 0 0 0
## [2,] 1 1 1 1
## Splitting the data set into training and test parts
Iris <- read.csv("data/chap2/Iris.csv", header = TRUE)
## Drop the Id column; keep the four measurements and Species
Iris <- Iris[2:6]
head(Iris, 2)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## Fix the random seed so that the random split, the stratified
## partition and the folds below are identical on every run. Index
## listings recorded in this file came from an unseeded run and will not
## match exactly.
set.seed(123)
## Split 1: simple random 70% / 30% split with base R
num <- round(nrow(Iris) * 0.7)
index <- sample(nrow(Iris), size = num)
Iris_train <- Iris[index, ]
Iris_test <- Iris[-index, ]
dim(Iris_train)
## [1] 105 5
dim(Iris_test)
## [1] 45 5
## Split 2: using the caret package.
## createDataPartition() returns the row positions of the training
## samples within the full data set (as a list element "Resample1").
index <- createDataPartition(Iris$Species, p = 0.7)
Iris_train <- Iris[index$Resample1, ]
Iris_test <- Iris[-index$Resample1, ]
dim(Iris_train)
## [1] 105 5
dim(Iris_test)
## [1] 45 5
## Row positions of each fold for k-fold cross-validation (k = 3)
index2 <- createFolds(Iris$Species, k = 3)
index2
## $Fold1
## [1] 1 6 13 14 15 19 21 22 23 24 26 28 31 34 35 38 44 51 52
## [20] 56 57 60 62 65 70 72 74 79 81 86 87 88 91 99 104 107 109 111
## [39] 113 115 119 121 124 128 139 141 142 144 145 147
##
## $Fold2
## [1] 3 4 8 9 11 12 16 18 20 29 32 36 43 47 49 50 54 58 59
## [20] 61 69 73 75 77 78 80 83 85 90 92 94 97 100 101 102 108 116 118
## [39] 122 123 126 129 131 133 134 135 136 137 143 149
##
## $Fold3
## [1] 2 5 7 10 17 25 27 30 33 37 39 40 41 42 45 46 48 53 55
## [20] 63 64 66 67 68 71 76 82 84 89 93 95 96 98 103 105 106 110 112
## [39] 114 117 120 125 127 130 132 138 140 146 148 150
iris <- read.csv(file = "data/chap2/Iris.csv")
## The four numeric measurement columns, extracted once and reused by
## every summary below
iris_num <- iris[, 2:5]
## Central tendency
## Mean
vapply(iris_num, mean, numeric(1))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 5.843333 3.054000 3.758667 1.198667
## Median
vapply(iris_num, median, numeric(1))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 5.80 3.00 4.35 1.30
## Dispersion
## Variance
vapply(iris_num, var, numeric(1))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0.6856935 0.1880040 3.1131794 0.5824143
## Standard deviation
vapply(iris_num, sd, numeric(1))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0.8280661 0.4335943 1.7644204 0.7631607
## Median absolute deviation
vapply(iris_num, mad, numeric(1))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1.03782 0.37065 1.85325 1.03782
## Coefficient of variation (standard deviation / mean); larger values
## mean more dispersed data
vapply(iris_num, sd, numeric(1)) / vapply(iris_num, mean, numeric(1))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0.1417113 0.1419759 0.4694272 0.6366747
## Quartiles and extremes
vapply(iris_num, quantile, numeric(5))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0% 4.3 2.0 1.00 0.1
## 25% 5.1 2.8 1.60 0.3
## 50% 5.8 3.0 4.35 1.3
## 75% 6.4 3.3 5.10 1.8
## 100% 7.9 4.4 6.90 2.5
vapply(iris_num, fivenum, numeric(5))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] 4.3 2.0 1.00 0.1
## [2,] 5.1 2.8 1.60 0.3
## [3,] 5.8 3.0 4.35 1.3
## [4,] 6.4 3.3 5.10 1.8
## [5,] 7.9 4.4 6.90 2.5
vapply(iris_num, range, numeric(2))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] 4.3 2.0 1.0 0.1
## [2,] 7.9 4.4 6.9 2.5
## Interquartile range: IQR(x) = quantile(x, 3/4) - quantile(x, 1/4)
vapply(iris_num, IQR, numeric(1))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1.3 0.5 3.5 1.5
## Skewness and kurtosis, provided by the moments package
library(moments)
vapply(iris_num, skewness, numeric(1))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0.3117531 0.3307028 -0.2717120 -0.1039437
vapply(iris_num, kurtosis, numeric(1))
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 2.426432 3.241443 1.604641 1.664754
library(ggplot2)
library(tidyr)
## Wide -> long: stack the four measurement columns for plotting
irislong <- gather(
  iris[, 2:5],
  key = "varname",
  value = "value",
  SepalLengthCm:PetalWidthCm
)
## Density curve of each variable, distinguished by colour and line type
ggplot(irislong, aes(colour = varname, linetype = varname)) +
  theme_bw() +
  geom_density(aes(x = value), bw = 0.5)
## The same densities drawn with a semi-transparent fill
ggplot(
  irislong,
  aes(colour = varname, fill = varname, linetype = varname)
) +
  theme_bw() +
  geom_density(aes(x = value), bw = 0.5, alpha = 0.4)
## Base-graphics density of sepal width, with its skewness for reference
plot(density(iris$SepalWidthCm))
skewness(iris$SepalWidthCm)
## [1] 0.3307028
## Correlation coefficients between the four measurements
cor(iris[, 2:5])
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm 1.0000000 -0.1093692 0.8717542 0.8179536
## SepalWidthCm -0.1093692 1.0000000 -0.4205161 -0.3565441
## PetalLengthCm 0.8717542 -0.4205161 1.0000000 0.9627571
## PetalWidthCm 0.8179536 -0.3565441 0.9627571 1.0000000
## 数据之间的距离
## 计算3种花之间的4个特征均值,然后计算他们之间的距离
## 数据准备
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Mean of each of the four measurements per species
newdata <- iris %>%
  group_by(Species) %>%
  summarise(
    SepalLengthMean = mean(SepalLengthCm),
    SepalWidthMean = mean(SepalWidthCm),
    PetalLengthMean = mean(PetalLengthCm),
    PetalWidthMean = mean(PetalWidthCm)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## Tibbles do not keep row names: assigning them is deprecated and the
## labels are lost. Convert to a plain data frame first, so the species
## names survive as row names and label the rows of the table.
newdata <- as.data.frame(newdata)
rownames(newdata) <- newdata$Species
## Species now lives in the row names; drop the column so that only
## numeric columns remain
newdata$Species <- NULL
newdata
## # A tibble: 3 x 4
## SepalLengthMean SepalWidthMean PetalLengthMean PetalWidthMean
## <dbl> <dbl> <dbl> <dbl>
## 1 5.01 3.42 1.46 0.244
## 2 5.94 2.77 4.26 1.33
## 3 6.59 2.97 5.55 2.03
## Pairwise distances between the rows of newdata under several metrics.
## TRUE is spelled out instead of T, because T is an ordinary variable
## that can be reassigned.
## Euclidean distance
dist(newdata, method = "euclidean", upper = TRUE, diag = TRUE)
## 1 2 3
## 1 0.000000 3.205175 4.752592
## 2 3.205175 0.000000 1.620489
## 3 4.752592 1.620489 0.000000
## Manhattan distance
dist(newdata, method = "manhattan", upper = TRUE, diag = TRUE)
## 1 2 3
## 1 0.000 5.456 7.896
## 2 5.456 0.000 2.848
## 3 7.896 2.848 0.000
## Maximum distance
dist(newdata, method = "maximum", upper = TRUE, diag = TRUE)
## 1 2 3
## 1 0.000 2.796 4.088
## 2 2.796 0.000 1.292
## 3 4.088 1.292 0.000
## Canberra distance
dist(newdata, method = "canberra", upper = TRUE, diag = TRUE)
## 1 2 3
## 1 0.0000000 1.3673540 1.5736019
## 2 1.3673540 0.0000000 0.4280814
## 3 1.5736019 0.4280814 0.0000000
## Minkowski distance with power p
## NOTE(review): p = 0.5 is below 1, where the Minkowski formula is not
## a true metric - confirm this value is intended.
dist(newdata, method = "minkowski", upper = TRUE, diag = TRUE, p = 0.5)
## 1 2 3
## 1 0.00000 20.08533 27.88796
## 2 20.08533 0.00000 10.44874
## 3 27.88796 10.44874 0.00000
## 很多时候数据不会是完整的,会存在有缺失值的情况,这时需要对缺失的数据进行处理。
## Read the air-quality data that contains missing values
myair <- read.csv(file = "data/chap2/myairquality.csv")
dim(myair)
## [1] 153 7
## Per-column summary, including the NA count of each column
summary(myair)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.70 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.40 1st Qu.:72.25
## Median : 31.50 Median :205.0 Median : 9.70 Median :79.00
## Mean : 42.13 Mean :185.9 Mean :10.01 Mean :77.87
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.50 3rd Qu.:84.00
## Max. :168.00 Max. :334.0 Max. :20.70 Max. :97.00
## NA's :37 NA's :7 NA's :4 NA's :3
## Month Day Type
## Min. :5.000 Min. : 1.00 Length:153
## 1st Qu.:6.000 1st Qu.: 8.00 Class :character
## Median :7.000 Median :16.00 Mode :character
## Mean :6.993 Mean :15.71
## 3rd Qu.:8.000 3rd Qu.:23.00
## Max. :9.000 Max. :31.00
## NA's :3 NA's :6
## 1:检查数据是否存在缺失值
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
##
## Attaching package: 'grid'
## The following object is masked from 'package:imager':
##
## depth
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
## Visual check of whether the data contain missing values
aggr(myair)
## complete.cases() flags the rows that have no missing value;
## negate it to extract the rows that do contain missing values
mynadata <- myair[!complete.cases(myair), , drop = FALSE]
dim(mynadata)
## [1] 57 7
head(mynadata)
## Ozone Solar.R Wind Temp Month Day Type
## 2 36 118 8.0 72 5 2 <NA>
## 5 NA NA 14.3 56 5 5 C
## 6 28 NA 14.9 66 5 6 B
## 8 19 99 13.8 NA 5 8 A
## 10 NA 194 8.6 69 5 10 C
## 11 7 NA 6.9 74 5 11 B
## matrixplot() shows the missing values in detail;
## per the author, red marks missing data
matrixplot(mynadata)
##
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
## Keep only the rows without any missing value
newdata <- na.omit(object = myair)
dim(newdata)
## [1] 96 7
head(newdata)
## Ozone Solar.R Wind Temp Month Day Type
## 1 41 190 7.4 67 5 1 A
## 3 12 149 12.6 74 5 3 A
## 4 18 313 11.5 62 5 4 C
## 7 23 299 8.6 65 5 7 B
## 9 8 19 20.1 61 5 9 C
## 12 16 256 9.7 69 5 12 B
## Simple approaches
## Different situations and variable types call for different ways of
## handling missing values.
## 1: fill the gaps with the mean, median, mode, etc.
## Work on a copy so that myair keeps the original missing values
myair2 <- myair
## Ozone: replace missing values with the mean of the observed values;
## is.na() locates the missing positions
ozone_mean <- mean(myair$Ozone, na.rm = TRUE)
myair2$Ozone[is.na(myair$Ozone)] <- ozone_mean
## Positions of the missing Solar.R values
which(is.na(myair$Solar.R))
## [1] 5 6 11 27 96 97 98
## Solar.R: replace missing values with the median of the observed
## values
solar_median <- median(myair2$Solar.R, na.rm = TRUE)
myair2$Solar.R[is.na(myair$Solar.R)] <- solar_median
## 使用前面的或者后面的数据填补缺失值
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Fill missing values with the previous or the next observation.
## na.locf() drops leading NAs by default (na.rm = TRUE), which makes
## the result shorter than the column and the assignment fail.
## na.rm = FALSE keeps the length; a second pass in the opposite
## direction fills whatever the first pass could not reach.
## Wind: carry the previous value forward, then back-fill a leading gap
wind_filled <- na.locf(myair$Wind, na.rm = FALSE)
myair2$Wind <- na.locf(wind_filled, fromLast = TRUE, na.rm = FALSE)
## Temp: carry the next value backward, then forward-fill a trailing gap
temp_filled <- na.locf(myair$Temp, fromLast = TRUE, na.rm = FALSE)
myair2$Temp <- na.locf(temp_filled, na.rm = FALSE)
## Month: fill each gap with the rounded average of the nearest observed
## values before and after it.
## Positions of the missing months
naindex <- which(is.na(myair$Month))
## Nearest observed month before / after every row. Unlike indexing with
## naindex - 1 and naindex + 1, this also works for runs of consecutive
## NAs and for gaps at the first or last row.
month_before <- na.locf(myair$Month, na.rm = FALSE)
month_after <- na.locf(myair$Month, fromLast = TRUE, na.rm = FALSE)
## At the edges only one side exists; use it for both
no_before <- is.na(month_before)
no_after <- is.na(month_after)
month_before[no_before] <- month_after[no_before]
month_after[no_after] <- month_before[no_after]
newnamonth <- round((month_before[naindex] + month_after[naindex]) / 2)
myair2$Month[naindex] <- newnamonth
## Day: given how the data are laid out, use the previous day + 1.
naindex <- which(is.na(myair$Day))
## Fill in ascending row order, reading the predecessor from myair2, so
## that a run of consecutive gaps keeps counting upwards. A gap in row 1
## has no predecessor and is left as NA.
## NOTE(review): the previous day + 1 can exceed the length of the month
## when the gap follows the last day of a month - confirm against data.
for (i in naindex[naindex > 1L]) {
  myair2$Day[i] <- myair2$Day[i - 1L] + 1
}
newnaday <- myair2$Day[naindex]
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:imager':
##
## label
## The following objects are masked from 'package:base':
##
## format.pval, units
## Fill the missing values of the Type variable with its mode
## Frequency of each category
type_counts <- table(myair$Type)
type_counts
##
## A B C
## 39 51 58
## Derive the most frequent category from the counts instead of
## hard-coding it, so the code stays correct if the data change
## (ties resolve to the first category in table order)
type_mode <- names(type_counts)[which.max(type_counts)]
myair2$Type <- impute(myair$Type, type_mode)
## Inspect the missing-value pattern of the processed data set
aggr(myair2)
## 复杂的数据缺失值处理方法
## More elaborate ways of handling missing values
colnames(myair)
## [1] "Ozone" "Solar.R" "Wind" "Temp" "Month" "Day" "Type"
## Treat "Ozone", "Solar.R", "Wind" and "Temp" as related variables and
## impute these four features together.
## Keep only those four columns (this overwrites myair)
myair <- myair[, 1:4]
## 使用KNN方法来填补缺失值
library(DMwR2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'DMwR2'
## The following object is masked from 'package:VIM':
##
## kNN
## Impute with k-nearest neighbours: k = 5 neighbours, scaled data,
## combined with the "weighAvg" method
myair2 <- knnImputation(
  myair,
  k = 5,
  scale = TRUE,
  meth = "weighAvg"
)
## 使用随机森林的方式填补缺失值
library(missForest)
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:imager':
##
## grow
## Loading required package: foreach
## Loading required package: itertools
## Loading required package: iterators
##
## Attaching package: 'missForest'
## The following object is masked from 'package:VIM':
##
## nrmse
## Random-forest imputation is stochastic; fix the seed so the imputed
## values and the error estimate are reproducible between runs
set.seed(123)
## Impute all four columns with missForest, using 50 trees per forest
myair2 <- missForest(myair, ntree = 50)
## missForest iteration 1 in progress...done!
## missForest iteration 2 in progress...done!
## missForest iteration 3 in progress...done!
## The data set after imputation (element "ximp" of the result)
myair2[["ximp"]]
## Ozone Solar.R Wind Temp
## 1 41.00000 190.000 7.400 67.00000
## 2 36.00000 118.000 8.000 72.00000
## 3 12.00000 149.000 12.600 74.00000
## 4 18.00000 313.000 11.500 62.00000
## 5 19.32000 153.360 14.300 56.00000
## 6 28.00000 247.912 14.900 66.00000
## 7 23.00000 299.000 8.600 65.00000
## 8 19.00000 99.000 13.800 72.62000
## 9 8.00000 19.000 20.100 61.00000
## 10 24.74000 194.000 8.600 69.00000
## 11 7.00000 54.980 6.900 74.00000
## 12 16.00000 256.000 9.700 69.00000
## 13 11.00000 290.000 9.200 66.00000
## 14 14.00000 274.000 10.900 68.00000
## 15 18.00000 65.000 13.200 58.00000
## 16 14.00000 334.000 11.500 64.00000
## 17 34.00000 307.000 12.000 66.00000
## 18 6.00000 78.000 18.400 57.00000
## 19 30.00000 322.000 11.500 68.00000
## 20 11.00000 44.000 9.700 62.00000
## 21 1.00000 8.000 9.700 59.00000
## 22 11.00000 320.000 11.608 73.00000
## 23 4.00000 25.000 9.700 61.00000
## 24 32.00000 92.000 12.000 61.00000
## 25 12.50000 66.000 16.600 57.00000
## 26 19.10000 266.000 14.900 58.00000
## 27 26.18000 154.760 8.000 57.00000
## 28 23.00000 13.000 12.000 67.00000
## 29 45.00000 252.000 14.900 81.00000
## 30 115.00000 223.000 5.700 79.00000
## 31 37.00000 279.000 7.400 76.00000
## 32 40.57867 286.000 8.600 78.00000
## 33 22.48000 287.000 9.700 74.00000
## 34 20.20000 242.000 16.100 67.00000
## 35 50.38000 186.000 9.200 84.00000
## 36 80.66000 220.000 8.600 85.00000
## 37 34.33500 264.000 14.300 79.00000
## 38 29.00000 127.000 9.700 82.00000
## 39 75.58000 273.000 6.900 87.00000
## 40 71.00000 291.000 13.800 90.00000
## 41 39.00000 323.000 11.500 87.00000
## 42 81.64000 259.000 10.900 93.00000
## 43 84.46000 250.000 9.200 92.00000
## 44 23.00000 148.000 8.000 82.00000
## 45 35.38833 332.000 13.800 80.00000
## 46 32.57333 322.000 11.500 79.00000
## 47 21.00000 191.000 14.900 77.00000
## 48 37.00000 284.000 20.700 72.00000
## 49 20.00000 37.000 9.200 65.00000
## 50 12.00000 120.000 11.500 73.00000
## 51 13.00000 137.000 10.300 76.00000
## 52 36.14000 150.000 6.300 77.00000
## 53 71.37800 59.000 1.700 76.00000
## 54 59.72000 91.000 4.600 76.00000
## 55 37.06000 250.000 6.300 76.00000
## 56 19.44000 135.000 8.000 75.00000
## 57 33.75067 127.000 8.000 78.00000
## 58 10.24800 47.000 10.300 73.00000
## 59 32.48000 98.000 11.500 80.00000
## 60 13.02800 31.000 14.900 77.00000
## 61 36.34000 138.000 8.000 83.00000
## 62 135.00000 269.000 4.100 84.00000
## 63 49.00000 248.000 9.200 85.00000
## 64 32.00000 236.000 9.200 81.00000
## 65 38.66000 101.000 10.900 84.00000
## 66 64.00000 175.000 4.600 83.00000
## 67 40.00000 314.000 10.900 83.00000
## 68 77.00000 276.000 5.100 88.00000
## 69 97.00000 267.000 6.300 92.00000
## 70 97.00000 272.000 5.700 92.00000
## 71 85.00000 175.000 6.118 89.00000
## 72 22.32067 139.000 8.600 82.00000
## 73 10.00000 264.000 14.300 73.00000
## 74 27.00000 175.000 14.900 81.00000
## 75 25.64000 291.000 14.900 68.19567
## 76 7.00000 48.000 14.300 80.00000
## 77 48.00000 260.000 6.900 81.00000
## 78 35.00000 274.000 10.300 82.00000
## 79 61.00000 285.000 6.300 84.00000
## 80 79.00000 187.000 5.712 87.00000
## 81 63.00000 220.000 11.500 85.00000
## 82 16.00000 7.000 6.900 74.00000
## 83 49.49800 258.000 9.700 81.00000
## 84 35.23333 295.000 11.500 82.00000
## 85 80.00000 294.000 8.600 86.00000
## 86 108.00000 223.000 8.000 85.00000
## 87 20.00000 81.000 8.600 82.00000
## 88 52.00000 82.000 12.000 86.00000
## 89 82.00000 213.000 7.400 88.00000
## 90 50.00000 275.000 7.400 80.88000
## 91 64.00000 253.000 7.400 83.00000
## 92 59.00000 254.000 9.200 81.00000
## 93 39.00000 83.000 6.900 81.00000
## 94 9.00000 24.000 13.800 81.00000
## 95 16.00000 77.000 7.400 82.00000
## 96 78.00000 238.720 6.900 86.00000
## 97 35.00000 192.120 7.400 85.00000
## 98 66.00000 212.420 4.600 87.00000
## 99 122.00000 255.000 4.000 89.00000
## 100 89.00000 229.000 10.300 90.00000
## 101 110.00000 207.000 8.000 90.00000
## 102 95.90000 222.000 8.600 92.00000
## 103 47.62000 137.000 11.500 86.00000
## 104 44.00000 192.000 11.500 86.00000
## 105 28.00000 273.000 11.500 82.00000
## 106 65.00000 157.000 9.700 80.00000
## 107 27.24000 64.000 11.500 79.00000
## 108 22.00000 71.000 10.300 77.00000
## 109 59.00000 51.000 6.300 79.00000
## 110 23.00000 115.000 7.400 76.00000
## 111 31.00000 244.000 10.900 78.00000
## 112 44.00000 190.000 10.300 78.00000
## 113 21.00000 259.000 15.500 77.00000
## 114 9.00000 36.000 14.300 72.00000
## 115 17.34000 255.000 12.600 75.00000
## 116 45.00000 212.000 9.700 79.00000
## 117 168.00000 238.000 3.400 81.00000
## 118 73.00000 215.000 8.000 86.00000
## 119 76.74000 153.000 5.700 88.00000
## 120 76.00000 203.000 9.700 97.00000
## 121 118.00000 225.000 6.348 94.00000
## 122 84.00000 237.000 6.300 96.00000
## 123 85.00000 188.000 6.300 94.00000
## 124 96.00000 167.000 6.900 91.00000
## 125 78.00000 197.000 5.100 92.00000
## 126 73.00000 183.000 2.800 93.00000
## 127 91.00000 189.000 4.600 93.00000
## 128 47.00000 95.000 7.400 87.00000
## 129 32.00000 92.000 15.500 84.00000
## 130 20.00000 252.000 10.900 80.00000
## 131 23.00000 220.000 10.300 78.00000
## 132 21.00000 230.000 10.900 75.00000
## 133 24.00000 259.000 9.700 73.00000
## 134 44.00000 236.000 14.900 81.00000
## 135 21.00000 259.000 15.500 76.00000
## 136 28.00000 238.000 6.300 77.00000
## 137 9.00000 24.000 10.900 71.00000
## 138 13.00000 112.000 11.500 71.00000
## 139 46.00000 237.000 6.900 78.00000
## 140 18.00000 224.000 13.800 67.00000
## 141 13.00000 27.000 10.300 76.00000
## 142 24.00000 238.000 10.300 68.00000
## 143 16.00000 201.000 8.000 82.00000
## 144 13.00000 238.000 12.600 64.00000
## 145 23.00000 14.000 9.200 71.00000
## 146 36.00000 139.000 10.300 81.00000
## 147 7.00000 49.000 10.300 69.00000
## 148 14.00000 20.000 16.600 63.00000
## 149 30.00000 193.000 6.900 70.00000
## 150 17.76000 145.000 13.200 77.00000
## 151 14.00000 191.000 14.300 75.00000
## 152 18.00000 131.000 8.000 76.00000
## 153 20.00000 223.000 11.500 68.00000
## Out-of-bag imputation error estimate (element "OOBerror")
myair2[["OOBerror"]]
## NRMSE
## 0.5566858
## 缺失值多重插补
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:imager':
##
## squeeze
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
## Multivariate imputation by chained equations
## m: number of multiple imputations
## method: imputation method, one entry per column of the data
##   norm.predict: linear regression, predicted values
##   pmm: predictive mean matching (not plain mean imputation)
##   rf: random forest
##   norm: Bayesian linear regression
## seed: fixes the random number generator so the imputations are
##   reproducible between runs
impdta <- mice(
  myair,
  m = 5,
  method = c("norm.predict", "pmm", "rf", "norm"),
  seed = 123
)
##
## iter imp variable
## 1 1 Ozone Solar.R Wind Temp
## 1 2 Ozone Solar.R Wind Temp
## 1 3 Ozone Solar.R Wind Temp
## 1 4 Ozone Solar.R Wind Temp
## 1 5 Ozone Solar.R Wind Temp
## 2 1 Ozone Solar.R Wind Temp
## 2 2 Ozone Solar.R Wind Temp
## 2 3 Ozone Solar.R Wind Temp
## 2 4 Ozone Solar.R Wind Temp
## 2 5 Ozone Solar.R Wind Temp
## 3 1 Ozone Solar.R Wind Temp
## 3 2 Ozone Solar.R Wind Temp
## 3 3 Ozone Solar.R Wind Temp
## 3 4 Ozone Solar.R Wind Temp
## 3 5 Ozone Solar.R Wind Temp
## 4 1 Ozone Solar.R Wind Temp
## 4 2 Ozone Solar.R Wind Temp
## 4 3 Ozone Solar.R Wind Temp
## 4 4 Ozone Solar.R Wind Temp
## 4 5 Ozone Solar.R Wind Temp
## 5 1 Ozone Solar.R Wind Temp
## 5 2 Ozone Solar.R Wind Temp
## 5 3 Ozone Solar.R Wind Temp
## 5 4 Ozone Solar.R Wind Temp
## 5 5 Ozone Solar.R Wind Temp
## Overview of the imputation object: number of imputations, method per
## variable and predictor matrix
summary(object = impdta)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## Ozone Solar.R Wind Temp
## "norm.predict" "pmm" "rf" "norm"
## PredictorMatrix:
## Ozone Solar.R Wind Temp
## Ozone 0 1 1 1
## Solar.R 1 0 1 1
## Wind 1 1 0 1
## Temp 1 1 1 0