数据的预处理

数据获取

读取各种各样的数据到R中

## 1:读取CSV文件
## 方式1
csvdata <- read.csv("data/chap2/Iris.csv",header = TRUE)
head(csvdata)

##   Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm     Species
## 1  1           5.1          3.5           1.4          0.2 Iris-setosa
## 2  2           4.9          3.0           1.4          0.2 Iris-setosa
## 3  3           4.7          3.2           1.3          0.2 Iris-setosa
## 4  4           4.6          3.1           1.5          0.2 Iris-setosa
## 5  5           5.0          3.6           1.4          0.2 Iris-setosa
## 6  6           5.4          3.9           1.7          0.4 Iris-setosa

str(csvdata)

## 'data.frame':    150 obs. of  6 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...

## 方式2
csvdata <- read.table("data/chap2/Iris.csv",header = TRUE,sep = ",")
head(csvdata)

##   Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm     Species
## 1  1           5.1          3.5           1.4          0.2 Iris-setosa
## 2  2           4.9          3.0           1.4          0.2 Iris-setosa
## 3  3           4.7          3.2           1.3          0.2 Iris-setosa
## 4  4           4.6          3.1           1.5          0.2 Iris-setosa
## 5  5           5.0          3.6           1.4          0.2 Iris-setosa
## 6  6           5.4          3.9           1.7          0.4 Iris-setosa

str(csvdata)

## 'data.frame':    150 obs. of  6 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...

## Method 3: readr::read_csv() with explicit column types.
## Compact type codes: c = character, i = integer, n = number, d = double,
## l = logical, D = date, T = date time, t = time, ? = guess
library(readr)
csvdata <- read_csv(
  "data/chap2/Iris.csv",
  col_names = TRUE,
  col_types = "dddddc"  # five double columns followed by one character column
)
head(csvdata, n = 2)

## # A tibble: 2 x 6
##      Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species    
##   <dbl>         <dbl>        <dbl>         <dbl>        <dbl> <chr>      
## 1     1           5.1          3.5           1.4          0.2 Iris-setosa
## 2     2           4.9          3             1.4          0.2 Iris-setosa

str(csvdata)

## tibble [150 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Id           : num [1:150] 1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : chr [1:150] "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Id = col_double(),
##   ..   SepalLengthCm = col_double(),
##   ..   SepalWidthCm = col_double(),
##   ..   PetalLengthCm = col_double(),
##   ..   PetalWidthCm = col_double(),
##   ..   Species = col_character()
##   .. )

## Save the data as CSV files.
## readr::write_csv() writes only the columns of csvdata (no row names).
write_csv(csvdata,"data/chap2/IrisWrite_1.csv")

## Base write.csv() with quote = FALSE: character values and the header are
## written without surrounding double quotes. row.names is left at its default
## (TRUE), so this file also gets a leading column holding the row names.
write.csv(csvdata,"data/chap2/IrisWrite_2.csv",quote = FALSE)

## 2: 读取excel数据
library(readxl)
exceldata <- read_excel("data/chap2/Iris.xlsx",sheet = "Iris")
str(exceldata,2)

## tibble [150 × 6] (S3: tbl_df/tbl/data.frame)
##  $ Id           : num [1:150] 1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : chr [1:150] "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...

## Reading image data
## Read a PNG image
library(png)
impng <- readPNG("data/chap2/Rlogo.png")
r <- nrow(impng) / ncol(impng) # image ratio: number of rows / number of columns
## Set up an empty plot (type = "n" draws nothing) spanning 0..1 by 0..r;
## asp = 1 gives both axes the same scale
plot(c(0,1), c(0,r), type = "n", xlab = "", ylab = "", asp=1)
## The next line is not supported in the Notebook, but runs fine in the Console
rasterImage(impng, 0, 0, 1, r)

## Inspect the structure of the array returned by readPNG()
str(impng)

##  num [1:76, 1:100, 1:4] 0 0 0 0 0 0 0 0 0 0 ...

## load.image 可以读取多种格式的图像
library(imager)

## Loading required package: magrittr

## 
## Attaching package: 'imager'

## The following object is masked from 'package:magrittr':
## 
##     add

## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum

## The following object is masked from 'package:graphics':
## 
##     frame

## The following object is masked from 'package:base':
## 
##     save.image

imjpg <- load.image("data/chap2/image.jpg")
imdim <- dim(imjpg)
plot(imjpg,xlim = c(1,width(imjpg)),ylim = c(1,height(imjpg)))

数据操作

长宽数据变换,数据标准化处理,数据集切分

长宽数据变换

### 长宽数据变换
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:imager':
## 
##     fill

## The following object is masked from 'package:magrittr':
## 
##     extract

Iris <- read.csv("data/chap2/Iris.csv",header = TRUE)
head(Iris,2)

##   Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm     Species
## 1  1           5.1          3.5           1.4          0.2 Iris-setosa
## 2  2           4.9          3.0           1.4          0.2 Iris-setosa

str(Iris)

## 'data.frame':    150 obs. of  6 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...

## Wide to long (1): stack the four measurement columns
## (SepalLengthCm through PetalWidthCm) into a name column "varname" and a
## value column "value"; Id and Species are carried along unchanged.
## gather() is superseded by pivot_longer() in current tidyr; it is kept here
## so the recorded output stays valid.
Irislong <- gather(Iris,
                   key = "varname", value = "value",
                   SepalLengthCm:PetalWidthCm)
head(Irislong, 2)

##   Id     Species       varname value
## 1  1 Iris-setosa SepalLengthCm   5.1
## 2  2 Iris-setosa SepalLengthCm   4.9

str(Irislong)

## 'data.frame':    600 obs. of  4 variables:
##  $ Id     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Species: chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
##  $ varname: chr  "SepalLengthCm" "SepalLengthCm" "SepalLengthCm" "SepalLengthCm" ...
##  $ value  : num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...

## 长数据转化为宽数据1
IrisWidth <- spread(Irislong,key="varname",value="value")
head(IrisWidth,2)

##   Id     Species PetalLengthCm PetalWidthCm SepalLengthCm SepalWidthCm
## 1  1 Iris-setosa           1.4          0.2           5.1          3.5
## 2  2 Iris-setosa           1.4          0.2           4.9          3.0

str(IrisWidth)

## 'data.frame':    150 obs. of  6 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Species      : chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...

数据标准化

Iris <- read.csv("data/chap2/Iris.csv",header = TRUE)
Iris <- Iris[2:5]
head(Iris,2)

##   SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1           5.1          3.5           1.4          0.2
## 2           4.9          3.0           1.4          0.2

str(Iris)

## 'data.frame':    150 obs. of  4 variables:
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...

## 数据中心化：是指变量减去它的均值；
Irisc <- scale(Iris,center = TRUE, scale = FALSE)
apply(Irisc,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]     -1.543333       -1.054     -2.758667    -1.098667
## [2,]      2.056667        1.346      3.141333     1.301333

## 数据标准化：是指数值减去均值，再除以标准差；
## 数据标准化处理
Iriss <- scale(Iris,center = TRUE, scale = TRUE)
apply(Iriss,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]     -1.863780    -2.430844     -1.563497    -1.439627
## [2,]      2.483699     3.104284      1.780377     1.705189

## Min-max normalisation: a linear transformation of the original data.
## With minA and maxA the minimum and maximum of attribute A, an original
## value x of A is mapped into the interval [0, 1]:
##   new value = (x - min) / (max - min)
##
## Arguments:
##   x      numeric vector.
##   na.rm  logical, passed on to min() and max(). With the default (FALSE) a
##          single NA in x makes every element of the result NA; with TRUE the
##          NAs are ignored when finding the range and only stay NA in place.
## Value:
##   numeric vector of the same length as x. A constant x has zero range, so
##   the result is NaN (0 / 0).
minmax <- function(x, na.rm = FALSE) {
  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  ## The last expression is the value itself (not an assignment), so the
  ## result is returned visibly and prints when called at the console
  (x - lo) / (hi - lo)
}

Iris01 <- apply(Iris,2,minmax)
apply(Iris01,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]             0            0             0            0
## [2,]             1            1             1            1

## 使用caret包进行处理
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## preProcess得到的结果可以使用predict函数作用于新的数据集
## 而且还包括其他方法，如标准化 "scale", "range", 等
## 1 中心化
center <- preProcess(Iris,method = "center")
Irisc <- predict(center,Iris)
head(Irisc,2)

##   SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1    -0.7433333        0.446     -2.358667   -0.9986667
## 2    -0.9433333       -0.054     -2.358667   -0.9986667

apply(Irisc,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]     -1.543333       -1.054     -2.758667    -1.098667
## [2,]      2.056667        1.346      3.141333     1.301333

## 2 标准化
scal <- preProcess(Iris,method = c("center","scale"))
Iriss <- predict(scal,Iris)
head(Iriss,2)

##   SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1    -0.8976739    1.0286113     -1.336794    -1.308593
## 2    -1.1392005   -0.1245404     -1.336794    -1.308593

apply(Iriss,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]     -1.863780    -2.430844     -1.563497    -1.439627
## [2,]      2.483699     3.104284      1.780377     1.705189

## [0-1]化
minmax01 <- preProcess(Iris,method = "range",rangeBounds = c(0,1))
Iris01 <- predict(minmax01,Iris)
apply(Iris01,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]             0            0             0            0
## [2,]             1            1             1            1

## 数据集切分
Iris <- read.csv("data/chap2/Iris.csv",header = TRUE)
Iris <- Iris[2:6]
head(Iris,2)

##   SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm     Species
## 1           5.1          3.5           1.4          0.2 Iris-setosa
## 2           4.9          3.0           1.4          0.2 Iris-setosa

## Train/test split, approach 1: draw 70% of the row numbers at random
## (without replacement) for training; all remaining rows form the test set.
num <- round(0.7 * nrow(Iris))
index <- sample.int(nrow(Iris), size = num)
Iris_train <- Iris[index, , drop = FALSE]
Iris_test <- Iris[-index, , drop = FALSE]
dim(Iris_train)

## [1] 105   5

dim(Iris_test)

## [1] 45  5

## Train/test split, approach 2: use functions from the caret package.
## caret's partitioning functions return the row positions of the training
## samples within the full data set.
## Get the split indices with createDataPartition(); the result is used below
## as a list whose element Resample1 holds the training row positions.
index <- createDataPartition(Iris$Species, p = 0.7)
Iris_train <- Iris[index$Resample1,]
Iris_test <- Iris[-index$Resample1,]
dim(Iris_train)

## [1] 105   5

dim(Iris_test)

## [1] 45  5

## 获取数据k折的行位置
index2 <- createFolds(Iris$Species,k = 3)
index2

## $Fold1
##  [1]   1   6  13  14  15  19  21  22  23  24  26  28  31  34  35  38  44  51  52
## [20]  56  57  60  62  65  70  72  74  79  81  86  87  88  91  99 104 107 109 111
## [39] 113 115 119 121 124 128 139 141 142 144 145 147
## 
## $Fold2
##  [1]   3   4   8   9  11  12  16  18  20  29  32  36  43  47  49  50  54  58  59
## [20]  61  69  73  75  77  78  80  83  85  90  92  94  97 100 101 102 108 116 118
## [39] 122 123 126 129 131 133 134 135 136 137 143 149
## 
## $Fold3
##  [1]   2   5   7  10  17  25  27  30  33  37  39  40  41  42  45  46  48  53  55
## [20]  63  64  66  67  68  71  76  82  84  89  93  95  96  98 103 105 106 110 112
## [39] 114 117 120 125 127 130 132 138 140 146 148 150

数据描述

集中趋势、离散程度、偏度和峰度

iris <- read.csv("data/chap2/Iris.csv")

## Central tendency of the data
## Mean of each of the four measurement columns (columns 2 to 5)
vapply(iris[2:5], mean, numeric(1))

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##      5.843333      3.054000      3.758667      1.198667

## 中位数
apply(iris[,c(2:5)],2,median)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##          5.80          3.00          4.35          1.30

## 离散程度
## 方差
apply(iris[,c(2:5)],2,var)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##     0.6856935     0.1880040     3.1131794     0.5824143

## 标准差
apply(iris[,c(2:5)],2,sd)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##     0.8280661     0.4335943     1.7644204     0.7631607

## 中位数绝对偏差
apply(iris[,c(2:5)],2,mad)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##       1.03782       0.37065       1.85325       1.03782

## Coefficient of variation (standard deviation / mean) of each measurement
## column; the larger the value, the more dispersed the data
vapply(iris[2:5], function(v) sd(v) / mean(v), numeric(1))

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##     0.1417113     0.1419759     0.4694272     0.6366747

## 四分位数 和 极值
apply(iris[,c(2:5)],2,quantile)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0%             4.3          2.0          1.00          0.1
## 25%            5.1          2.8          1.60          0.3
## 50%            5.8          3.0          4.35          1.3
## 75%            6.4          3.3          5.10          1.8
## 100%           7.9          4.4          6.90          2.5

apply(iris[,c(2:5)],2,fivenum)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]           4.3          2.0          1.00          0.1
## [2,]           5.1          2.8          1.60          0.3
## [3,]           5.8          3.0          4.35          1.3
## [4,]           6.4          3.3          5.10          1.8
## [5,]           7.9          4.4          6.90          2.5

apply(iris[,c(2:5)],2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]           4.3          2.0           1.0          0.1
## [2,]           7.9          4.4           6.9          2.5

## 四分位数范围 IQR(x) = quantile(x, 3/4) - quantile(x, 1/4).
apply(iris[,c(2:5)],2,IQR)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##           1.3           0.5           3.5           1.5

## 偏度和峰度,可以使用moments库
library(moments)
apply(iris[,c(2:5)],2,skewness)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##     0.3117531     0.3307028    -0.2717120    -0.1039437

apply(iris[,c(2:5)],2,kurtosis)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##      2.426432      3.241443      1.604641      1.664754

library(ggplot2)
library(tidyr)
## 宽数据转化为长数据
irislong = gather(iris[,c(2:5)],key="varname",
                  value="value",SepalLengthCm:PetalWidthCm)
## 可视化数据的分布
ggplot(irislong,aes(colour = varname,linetype = varname))+
  theme_bw()+geom_density(aes(value),bw = 0.5)

## 可视化数据的分布
ggplot(irislong,aes(colour = varname,fill = varname,linetype = varname))+
  theme_bw()+geom_density(aes(value),bw = 0.5,alpha = 0.4)

plot(density(iris$SepalWidthCm))

skewness(iris$SepalWidthCm)

## [1] 0.3307028

数据相似性度量

## 相关系数
cor(iris[,c(2:5)])

##               SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm     1.0000000   -0.1093692     0.8717542    0.8179536
## SepalWidthCm     -0.1093692    1.0000000    -0.4205161   -0.3565441
## PetalLengthCm     0.8717542   -0.4205161     1.0000000    0.9627571
## PetalWidthCm      0.8179536   -0.3565441     0.9627571    1.0000000

## 数据之间的距离
## 计算3种花各自的4个特征均值，然后计算它们之间的距离
## 数据准备
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

newdata <- iris%>%group_by(Species)%>%
  summarise(SepalLengthMean = mean(SepalLengthCm),
            SepalWidthMean = mean(SepalWidthCm),
            PetalLengthMean = mean(PetalLengthCm),
            PetalWidthMean = mean(PetalWidthCm))

## `summarise()` ungrouping output (override with `.groups` argument)

## The species names should label the rows of the distance matrices computed
## from newdata. Assigning row names to a tibble is deprecated and the labels
## do not survive, so convert the summary to a plain data.frame first.
newdata <- as.data.frame(newdata)
rownames(newdata) <- newdata$Species

## Species now lives in the row names; drop the column so that only the four
## numeric means remain
newdata$Species <- NULL
newdata

## # A tibble: 3 x 4
##   SepalLengthMean SepalWidthMean PetalLengthMean PetalWidthMean
##             <dbl>          <dbl>           <dbl>          <dbl>
## 1            5.01           3.42            1.46          0.244
## 2            5.94           2.77            4.26          1.33 
## 3            6.59           2.97            5.55          2.03

## Euclidean distance between the rows of newdata; upper/diag = TRUE print the
## full symmetric matrix including the zero diagonal
dist(newdata, method = "euclidean", upper = TRUE, diag = TRUE)

##          1        2        3
## 1 0.000000 3.205175 4.752592
## 2 3.205175 0.000000 1.620489
## 3 4.752592 1.620489 0.000000

## Manhattan distance (sum of absolute coordinate differences)
dist(newdata, method = "manhattan", upper = TRUE, diag = TRUE)

##       1     2     3
## 1 0.000 5.456 7.896
## 2 5.456 0.000 2.848
## 3 7.896 2.848 0.000

## Maximum distance (largest absolute coordinate difference)
dist(newdata, method = "maximum", upper = TRUE, diag = TRUE)

##       1     2     3
## 1 0.000 2.796 4.088
## 2 2.796 0.000 1.292
## 3 4.088 1.292 0.000

## Canberra distance
dist(newdata, method = "canberra", upper = TRUE, diag = TRUE)

##           1         2         3
## 1 0.0000000 1.3673540 1.5736019
## 2 1.3673540 0.0000000 0.4280814
## 3 1.5736019 0.4280814 0.0000000

## Minkowski distance; p is the power of the distance
## NOTE(review): with p < 1 the triangle inequality no longer holds, so this is
## not a true metric -- confirm that p = 0.5 is intended
dist(newdata, method = "minkowski", upper = TRUE, diag = TRUE, p = 0.5)

##          1        2        3
## 1  0.00000 20.08533 27.88796
## 2 20.08533  0.00000 10.44874
## 3 27.88796 10.44874  0.00000

2.2：数据缺失值处理

很多时候数据不会是完整的，会存在有缺失值的情况，这时需要对缺失的数据进行处理。

##读取数据
myair <- read.csv("data/chap2/myairquality.csv")
dim(myair)

## [1] 153   7

summary(myair)

##      Ozone           Solar.R           Wind            Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.70   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.40   1st Qu.:72.25  
##  Median : 31.50   Median :205.0   Median : 9.70   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   :10.01   Mean   :77.87  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.50   3rd Qu.:84.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.70   Max.   :97.00  
##  NA's   :37       NA's   :7       NA's   :4       NA's   :3      
##      Month            Day            Type          
##  Min.   :5.000   Min.   : 1.00   Length:153        
##  1st Qu.:6.000   1st Qu.: 8.00   Class :character  
##  Median :7.000   Median :16.00   Mode  :character  
##  Mean   :6.993   Mean   :15.71                     
##  3rd Qu.:8.000   3rd Qu.:23.00                     
##  Max.   :9.000   Max.   :31.00                     
##  NA's   :3       NA's   :6

## 1:检查数据是否存在缺失值
library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## 
## Attaching package: 'grid'

## The following object is masked from 'package:imager':
## 
##     depth

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

## 可视化查看数据是否有缺失值
aggr(myair)

## complete.cases()返回每个样例是否完整（不含缺失值）
## 输出包含缺失值的样例
mynadata <- myair[!complete.cases(myair),]
dim(mynadata)

## [1] 57  7

head(mynadata)

##    Ozone Solar.R Wind Temp Month Day Type
## 2     36     118  8.0   72     5   2 <NA>
## 5     NA      NA 14.3   56     5   5    C
## 6     28      NA 14.9   66     5   6    B
## 8     19      99 13.8   NA     5   8    A
## 10    NA     194  8.6   69     5  10    C
## 11     7      NA  6.9   74     5  11    B

## matrixplot()可视化缺失值的详细情况
## 红色代表缺失数据的情况
matrixplot(mynadata)

## 
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.

## 只保留没有缺失值的样例
newdata <- na.omit(myair)
dim(newdata)

## [1] 96  7

head(newdata)

##    Ozone Solar.R Wind Temp Month Day Type
## 1     41     190  7.4   67     5   1    A
## 3     12     149 12.6   74     5   3    A
## 4     18     313 11.5   62     5   4    C
## 7     23     299  8.6   65     5   7    B
## 9      8      19 20.1   61     5   9    C
## 12    16     256  9.7   69     5  12    B

## 简单的方法
## 针对不同的情况和变量属性，可以使用不同的缺失值处理方法
## 1: 填补缺失值：
##    均值，中位数，众数等
## is.na()查看Ozone（臭氧）数据缺失值的位置
myair2 <- myair
## 使用均值填补缺失值
myair2$Ozone[is.na(myair$Ozone)] <- mean(myair$Ozone,na.rm = TRUE)

## 输出哪些位置有缺失值
which(is.na(myair$Solar.R))

## [1]  5  6 11 27 96 97 98

## 使用中位数填补缺失值
myair2$Solar.R[which(is.na(myair$Solar.R))] <- median(myair2$Solar.R,na.rm = TRUE)


## 使用前面的或者后面的数据填补缺失值
library(zoo)

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Fill missing values from the previous observation (Wind) or from the next
## observation (Temp, fromLast = TRUE).
## na.rm = FALSE keeps the result the same length as the input: with the
## default (na.rm = TRUE) a leading NA (or a trailing one when fromLast = TRUE)
## is dropped, and the column assignment then fails on the length mismatch.
myair2$Wind <- na.locf(myair$Wind, na.rm = FALSE)
myair2$Temp <- na.locf(myair$Temp, fromLast = TRUE, na.rm = FALSE)

## Month: fill each missing value with the rounded mean of the values in the
## row before and the row after it.
## Positions of the missing months
naindex <- which(is.na(myair$Month))
## NOTE(review): assumes every missing month has non-missing neighbours on both
## sides (not in the first or last row, no consecutive gaps) -- confirm before
## reusing on other data
newnamonth <- round((myair$Month[naindex - 1] + myair$Month[naindex + 1]) / 2)
myair2$Month[naindex] <- newnamonth

## Day: given how this data set is laid out, use the previous row's day + 1.
## NOTE(review): this does not wrap around at a month end and assumes the
## previous row's Day is present -- confirm
naindex <- which(is.na(myair$Day))
newnaday <- myair$Day[naindex - 1] + 1
myair2$Day[naindex] <- newnaday


library(Hmisc)

## Loading required package: survival

## 
## Attaching package: 'survival'

## The following object is masked from 'package:caret':
## 
##     cluster

## Loading required package: Formula

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following object is masked from 'package:imager':
## 
##     label

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## 使用众数填补缺失值Type变量
## 找出众数
table(myair$Type)

## 
##  A  B  C 
## 39 51 58

## Impute the missing Type values with the mode. The mode is read from the
## frequency table instead of being typed in by hand, so it stays correct if
## the data change (for the counts shown above it is "C"; which.max() takes the
## first level when several levels tie).
myair2$Type <- impute(myair$Type, names(which.max(table(myair$Type))))


## 观察处理后新数据集的缺失值情况
aggr(myair2)

复杂的数据缺失值处理方法

## 复杂的缺失值处理方法
colnames(myair)

## [1] "Ozone"   "Solar.R" "Wind"    "Temp"    "Month"   "Day"     "Type"

## 考虑到"Ozone"、"Solar.R"、"Wind"、"Temp"之间存在关联，对这四个特征进行缺失值处理
## 提取数据
myair <- myair[,c(1:4)]

## 使用KNN方法来填补缺失值
library(DMwR2)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## 
## Attaching package: 'DMwR2'

## The following object is masked from 'package:VIM':
## 
##     kNN

myair2 <- knnImputation(myair,k=5,scale = TRUE,meth = "weighAvg")

## 使用随机森林的方式填补缺失值
library(missForest)

## Loading required package: randomForest

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:imager':
## 
##     grow

## Loading required package: foreach

## Loading required package: itertools

## Loading required package: iterators

## 
## Attaching package: 'missForest'

## The following object is masked from 'package:VIM':
## 
##     nrmse

myair2 <- missForest(myair,ntree = 50)

##   missForest iteration 1 in progress...done!
##   missForest iteration 2 in progress...done!
##   missForest iteration 3 in progress...done!

## 填补缺失值后的数据
myair2$ximp

##         Ozone Solar.R   Wind     Temp
## 1    41.00000 190.000  7.400 67.00000
## 2    36.00000 118.000  8.000 72.00000
## 3    12.00000 149.000 12.600 74.00000
## 4    18.00000 313.000 11.500 62.00000
## 5    19.32000 153.360 14.300 56.00000
## 6    28.00000 247.912 14.900 66.00000
## 7    23.00000 299.000  8.600 65.00000
## 8    19.00000  99.000 13.800 72.62000
## 9     8.00000  19.000 20.100 61.00000
## 10   24.74000 194.000  8.600 69.00000
## 11    7.00000  54.980  6.900 74.00000
## 12   16.00000 256.000  9.700 69.00000
## 13   11.00000 290.000  9.200 66.00000
## 14   14.00000 274.000 10.900 68.00000
## 15   18.00000  65.000 13.200 58.00000
## 16   14.00000 334.000 11.500 64.00000
## 17   34.00000 307.000 12.000 66.00000
## 18    6.00000  78.000 18.400 57.00000
## 19   30.00000 322.000 11.500 68.00000
## 20   11.00000  44.000  9.700 62.00000
## 21    1.00000   8.000  9.700 59.00000
## 22   11.00000 320.000 11.608 73.00000
## 23    4.00000  25.000  9.700 61.00000
## 24   32.00000  92.000 12.000 61.00000
## 25   12.50000  66.000 16.600 57.00000
## 26   19.10000 266.000 14.900 58.00000
## 27   26.18000 154.760  8.000 57.00000
## 28   23.00000  13.000 12.000 67.00000
## 29   45.00000 252.000 14.900 81.00000
## 30  115.00000 223.000  5.700 79.00000
## 31   37.00000 279.000  7.400 76.00000
## 32   40.57867 286.000  8.600 78.00000
## 33   22.48000 287.000  9.700 74.00000
## 34   20.20000 242.000 16.100 67.00000
## 35   50.38000 186.000  9.200 84.00000
## 36   80.66000 220.000  8.600 85.00000
## 37   34.33500 264.000 14.300 79.00000
## 38   29.00000 127.000  9.700 82.00000
## 39   75.58000 273.000  6.900 87.00000
## 40   71.00000 291.000 13.800 90.00000
## 41   39.00000 323.000 11.500 87.00000
## 42   81.64000 259.000 10.900 93.00000
## 43   84.46000 250.000  9.200 92.00000
## 44   23.00000 148.000  8.000 82.00000
## 45   35.38833 332.000 13.800 80.00000
## 46   32.57333 322.000 11.500 79.00000
## 47   21.00000 191.000 14.900 77.00000
## 48   37.00000 284.000 20.700 72.00000
## 49   20.00000  37.000  9.200 65.00000
## 50   12.00000 120.000 11.500 73.00000
## 51   13.00000 137.000 10.300 76.00000
## 52   36.14000 150.000  6.300 77.00000
## 53   71.37800  59.000  1.700 76.00000
## 54   59.72000  91.000  4.600 76.00000
## 55   37.06000 250.000  6.300 76.00000
## 56   19.44000 135.000  8.000 75.00000
## 57   33.75067 127.000  8.000 78.00000
## 58   10.24800  47.000 10.300 73.00000
## 59   32.48000  98.000 11.500 80.00000
## 60   13.02800  31.000 14.900 77.00000
## 61   36.34000 138.000  8.000 83.00000
## 62  135.00000 269.000  4.100 84.00000
## 63   49.00000 248.000  9.200 85.00000
## 64   32.00000 236.000  9.200 81.00000
## 65   38.66000 101.000 10.900 84.00000
## 66   64.00000 175.000  4.600 83.00000
## 67   40.00000 314.000 10.900 83.00000
## 68   77.00000 276.000  5.100 88.00000
## 69   97.00000 267.000  6.300 92.00000
## 70   97.00000 272.000  5.700 92.00000
## 71   85.00000 175.000  6.118 89.00000
## 72   22.32067 139.000  8.600 82.00000
## 73   10.00000 264.000 14.300 73.00000
## 74   27.00000 175.000 14.900 81.00000
## 75   25.64000 291.000 14.900 68.19567
## 76    7.00000  48.000 14.300 80.00000
## 77   48.00000 260.000  6.900 81.00000
## 78   35.00000 274.000 10.300 82.00000
## 79   61.00000 285.000  6.300 84.00000
## 80   79.00000 187.000  5.712 87.00000
## 81   63.00000 220.000 11.500 85.00000
## 82   16.00000   7.000  6.900 74.00000
## 83   49.49800 258.000  9.700 81.00000
## 84   35.23333 295.000 11.500 82.00000
## 85   80.00000 294.000  8.600 86.00000
## 86  108.00000 223.000  8.000 85.00000
## 87   20.00000  81.000  8.600 82.00000
## 88   52.00000  82.000 12.000 86.00000
## 89   82.00000 213.000  7.400 88.00000
## 90   50.00000 275.000  7.400 80.88000
## 91   64.00000 253.000  7.400 83.00000
## 92   59.00000 254.000  9.200 81.00000
## 93   39.00000  83.000  6.900 81.00000
## 94    9.00000  24.000 13.800 81.00000
## 95   16.00000  77.000  7.400 82.00000
## 96   78.00000 238.720  6.900 86.00000
## 97   35.00000 192.120  7.400 85.00000
## 98   66.00000 212.420  4.600 87.00000
## 99  122.00000 255.000  4.000 89.00000
## 100  89.00000 229.000 10.300 90.00000
## 101 110.00000 207.000  8.000 90.00000
## 102  95.90000 222.000  8.600 92.00000
## 103  47.62000 137.000 11.500 86.00000
## 104  44.00000 192.000 11.500 86.00000
## 105  28.00000 273.000 11.500 82.00000
## 106  65.00000 157.000  9.700 80.00000
## 107  27.24000  64.000 11.500 79.00000
## 108  22.00000  71.000 10.300 77.00000
## 109  59.00000  51.000  6.300 79.00000
## 110  23.00000 115.000  7.400 76.00000
## 111  31.00000 244.000 10.900 78.00000
## 112  44.00000 190.000 10.300 78.00000
## 113  21.00000 259.000 15.500 77.00000
## 114   9.00000  36.000 14.300 72.00000
## 115  17.34000 255.000 12.600 75.00000
## 116  45.00000 212.000  9.700 79.00000
## 117 168.00000 238.000  3.400 81.00000
## 118  73.00000 215.000  8.000 86.00000
## 119  76.74000 153.000  5.700 88.00000
## 120  76.00000 203.000  9.700 97.00000
## 121 118.00000 225.000  6.348 94.00000
## 122  84.00000 237.000  6.300 96.00000
## 123  85.00000 188.000  6.300 94.00000
## 124  96.00000 167.000  6.900 91.00000
## 125  78.00000 197.000  5.100 92.00000
## 126  73.00000 183.000  2.800 93.00000
## 127  91.00000 189.000  4.600 93.00000
## 128  47.00000  95.000  7.400 87.00000
## 129  32.00000  92.000 15.500 84.00000
## 130  20.00000 252.000 10.900 80.00000
## 131  23.00000 220.000 10.300 78.00000
## 132  21.00000 230.000 10.900 75.00000
## 133  24.00000 259.000  9.700 73.00000
## 134  44.00000 236.000 14.900 81.00000
## 135  21.00000 259.000 15.500 76.00000
## 136  28.00000 238.000  6.300 77.00000
## 137   9.00000  24.000 10.900 71.00000
## 138  13.00000 112.000 11.500 71.00000
## 139  46.00000 237.000  6.900 78.00000
## 140  18.00000 224.000 13.800 67.00000
## 141  13.00000  27.000 10.300 76.00000
## 142  24.00000 238.000 10.300 68.00000
## 143  16.00000 201.000  8.000 82.00000
## 144  13.00000 238.000 12.600 64.00000
## 145  23.00000  14.000  9.200 71.00000
## 146  36.00000 139.000 10.300 81.00000
## 147   7.00000  49.000 10.300 69.00000
## 148  14.00000  20.000 16.600 63.00000
## 149  30.00000 193.000  6.900 70.00000
## 150  17.76000 145.000 13.200 77.00000
## 151  14.00000 191.000 14.300 75.00000
## 152  18.00000 131.000  8.000 76.00000
## 153  20.00000 223.000 11.500 68.00000

## OOB误差
myair2$OOBerror

##     NRMSE 
## 0.5566858

## 缺失值多重插补
library(mice)

## 
## Attaching package: 'mice'

## The following object is masked from 'package:imager':
## 
##     squeeze

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

## Multivariate imputation by chained equations
## m: number of multiple imputations
## method: imputation method; given here as one entry per column of myair
##   (Ozone, Solar.R, Wind, Temp, in that order)
## norm.predict: linear regression, predicted values; pmm: predictive mean
##   matching (not mean imputation); rf: random forest
## norm: Bayesian linear regression
impdta <- mice(myair,m = 5,method=c("norm.predict","pmm","rf","norm"))

## 
##  iter imp variable
##   1   1  Ozone  Solar.R  Wind  Temp
##   1   2  Ozone  Solar.R  Wind  Temp
##   1   3  Ozone  Solar.R  Wind  Temp
##   1   4  Ozone  Solar.R  Wind  Temp
##   1   5  Ozone  Solar.R  Wind  Temp
##   2   1  Ozone  Solar.R  Wind  Temp
##   2   2  Ozone  Solar.R  Wind  Temp
##   2   3  Ozone  Solar.R  Wind  Temp
##   2   4  Ozone  Solar.R  Wind  Temp
##   2   5  Ozone  Solar.R  Wind  Temp
##   3   1  Ozone  Solar.R  Wind  Temp
##   3   2  Ozone  Solar.R  Wind  Temp
##   3   3  Ozone  Solar.R  Wind  Temp
##   3   4  Ozone  Solar.R  Wind  Temp
##   3   5  Ozone  Solar.R  Wind  Temp
##   4   1  Ozone  Solar.R  Wind  Temp
##   4   2  Ozone  Solar.R  Wind  Temp
##   4   3  Ozone  Solar.R  Wind  Temp
##   4   4  Ozone  Solar.R  Wind  Temp
##   4   5  Ozone  Solar.R  Wind  Temp
##   5   1  Ozone  Solar.R  Wind  Temp
##   5   2  Ozone  Solar.R  Wind  Temp
##   5   3  Ozone  Solar.R  Wind  Temp
##   5   4  Ozone  Solar.R  Wind  Temp
##   5   5  Ozone  Solar.R  Wind  Temp

summary(impdta)

## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##          Ozone        Solar.R           Wind           Temp 
## "norm.predict"          "pmm"           "rf"         "norm" 
## PredictorMatrix:
##         Ozone Solar.R Wind Temp
## Ozone       0       1    1    1
## Solar.R     1       0    1    1
## Wind        1       1    0    1
## Temp        1       1    1    0