第2章: 数据管理和探索

2.1：数据获取

读取各种各样的数据到R中

## 1:读取CSV文件
## 方式1
csvdata <- read.csv("data/chap2/Iris.csv",header = TRUE)
head(csvdata)

##   Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm     Species
## 1  1           5.1          3.5           1.4          0.2 Iris-setosa
## 2  2           4.9          3.0           1.4          0.2 Iris-setosa
## 3  3           4.7          3.2           1.3          0.2 Iris-setosa
## 4  4           4.6          3.1           1.5          0.2 Iris-setosa
## 5  5           5.0          3.6           1.4          0.2 Iris-setosa
## 6  6           5.4          3.9           1.7          0.4 Iris-setosa

str(csvdata)

## 'data.frame':    150 obs. of  6 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...

## 方式2
csvdata <- read.table("data/chap2/Iris.csv",header = TRUE,sep = ",")
head(csvdata)

##   Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm     Species
## 1  1           5.1          3.5           1.4          0.2 Iris-setosa
## 2  2           4.9          3.0           1.4          0.2 Iris-setosa
## 3  3           4.7          3.2           1.3          0.2 Iris-setosa
## 4  4           4.6          3.1           1.5          0.2 Iris-setosa
## 5  5           5.0          3.6           1.4          0.2 Iris-setosa
## 6  6           5.4          3.9           1.7          0.4 Iris-setosa

str(csvdata)

## 'data.frame':    150 obs. of  6 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...

## 方式3  c = character, i = integer, n = number, d = double, l = logical, D = date, ## T = date time, t = time, ? = guess
library(readr)
csvdata <- read_csv("data/chap2/Iris.csv",col_names = TRUE,
                    col_types = list("d","d","d","d","d","c"))
head(csvdata,2)

## # A tibble: 2 x 6
##      Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species    
##   <dbl>         <dbl>        <dbl>         <dbl>        <dbl> <chr>      
## 1     1           5.1          3.5           1.4          0.2 Iris-setosa
## 2     2           4.9          3             1.4          0.2 Iris-setosa

str(csvdata)

## tibble [150 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Id           : num [1:150] 1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : chr [1:150] "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Id = col_double(),
##   ..   SepalLengthCm = col_double(),
##   ..   SepalWidthCm = col_double(),
##   ..   PetalLengthCm = col_double(),
##   ..   PetalWidthCm = col_double(),
##   ..   Species = col_character()
##   .. )

## 数据保存为csv
write_csv(csvdata,"data/chap2/IrisWrite_1.csv")

write.csv(csvdata,"data/chap2/IrisWrite_2.csv",quote = FALSE)

## 2: 读取excel数据
library(readxl)
exceldata <- read_excel("data/chap2/Iris.xlsx",sheet = "Iris")
str(exceldata,2)

## tibble [150 × 6] (S3: tbl_df/tbl/data.frame)
##  $ Id           : num [1:150] 1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : chr [1:150] "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...

## 读取spss数据
library(foreign)
spssdata <- read.spss("data/chap2/Iris_spss.sav",to.data.frame = TRUE)
head(spssdata,2)

##   Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm         Species
## 1  1           5.1          3.5           1.4          0.2 Iris-setosa    
## 2  2           4.9          3.0           1.4          0.2 Iris-setosa

str(spssdata)

## 'data.frame':    150 obs. of  6 variables:
##  $ Id           : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : chr  "Iris-setosa    " "Iris-setosa    " "Iris-setosa    " "Iris-setosa    " ...
##  - attr(*, "codepage")= int 65001

## 读取spss数据
library(haven)
spssdata <- read_sav("data/chap2/Iris_spss.sav")
head(spssdata,2)

## # A tibble: 2 x 6
##      Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species    
##   <dbl>         <dbl>        <dbl>         <dbl>        <dbl> <chr>      
## 1     1           5.1          3.5           1.4          0.2 Iris-setosa
## 2     2           4.9          3             1.4          0.2 Iris-setosa

str(spssdata)

## tibble [150 × 6] (S3: tbl_df/tbl/data.frame)
##  $ Id           : num [1:150] 1 2 3 4 5 6 7 8 9 10 ...
##   ..- attr(*, "format.spss")= chr "F11.0"
##   ..- attr(*, "display_width")= int 11
##  $ SepalLengthCm: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##   ..- attr(*, "format.spss")= chr "F11.1"
##   ..- attr(*, "display_width")= int 11
##  $ SepalWidthCm : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##   ..- attr(*, "format.spss")= chr "F11.1"
##   ..- attr(*, "display_width")= int 11
##  $ PetalLengthCm: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##   ..- attr(*, "format.spss")= chr "F11.1"
##   ..- attr(*, "display_width")= int 11
##  $ PetalWidthCm : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##   ..- attr(*, "format.spss")= chr "F11.1"
##   ..- attr(*, "display_width")= int 11
##  $ Species      : chr [1:150] "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
##   ..- attr(*, "format.spss")= chr "A15"
##   ..- attr(*, "display_width")= int 15

## 方法2 
spssdata <- read_spss("data/chap2/Iris_spss.sav")
head(spssdata,2)

## # A tibble: 2 x 6
##      Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species    
##   <dbl>         <dbl>        <dbl>         <dbl>        <dbl> <chr>      
## 1     1           5.1          3.5           1.4          0.2 Iris-setosa
## 2     2           4.9          3             1.4          0.2 Iris-setosa

## 读取SAS数据
sasdata <- read_sas("data/chap2/iris.sas7bdat")
head(sasdata,2)

## # A tibble: 2 x 5
##   Sepal_Length Sepal_Width Petal_Length Petal_Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <chr>  
## 1          5.1         3.5          1.4         0.2 setosa 
## 2          4.9         3            1.4         0.2 setosa

## 读取stata数据
dtadata <- read_dta("data/chap2/iris.dta")
head(dtadata,2)

## # A tibble: 2 x 5
##   sepallength sepalwidth petallength petalwidth species
##         <dbl>      <dbl>       <dbl>      <dbl> <chr>  
## 1        5.10        3.5        1.40      0.200 setosa 
## 2        4.90        3          1.40      0.200 setosa

str(dtadata)

## tibble [150 × 5] (S3: tbl_df/tbl/data.frame)
##  $ sepallength: num [1:150] 5.1 4.9 4.7 4.6 5 ...
##   ..- attr(*, "label")= chr "Sepal.Length"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ sepalwidth : num [1:150] 3.5 3 3.2 3.1 3.6 ...
##   ..- attr(*, "label")= chr "Sepal.Width"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ petallength: num [1:150] 1.4 1.4 1.3 1.5 1.4 ...
##   ..- attr(*, "label")= chr "Petal.Length"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ petalwidth : num [1:150] 0.2 0.2 0.2 0.2 0.2 ...
##   ..- attr(*, "label")= chr "Petal.Width"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ species    : chr [1:150] "setosa" "setosa" "setosa" "setosa" ...
##   ..- attr(*, "label")= chr "Species"
##   ..- attr(*, "format.stata")= chr "%10s"

dtadata <- read_stata("data/chap2/iris.dta")
head(dtadata,2)

## # A tibble: 2 x 5
##   sepallength sepalwidth petallength petalwidth species
##         <dbl>      <dbl>       <dbl>      <dbl> <chr>  
## 1        5.10        3.5        1.40      0.200 setosa 
## 2        4.90        3          1.40      0.200 setosa

str(dtadata)

## tibble [150 × 5] (S3: tbl_df/tbl/data.frame)
##  $ sepallength: num [1:150] 5.1 4.9 4.7 4.6 5 ...
##   ..- attr(*, "label")= chr "Sepal.Length"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ sepalwidth : num [1:150] 3.5 3 3.2 3.1 3.6 ...
##   ..- attr(*, "label")= chr "Sepal.Width"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ petallength: num [1:150] 1.4 1.4 1.3 1.5 1.4 ...
##   ..- attr(*, "label")= chr "Petal.Length"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ petalwidth : num [1:150] 0.2 0.2 0.2 0.2 0.2 ...
##   ..- attr(*, "label")= chr "Petal.Width"
##   ..- attr(*, "format.stata")= chr "%9.0g"
##  $ species    : chr [1:150] "setosa" "setosa" "setosa" "setosa" ...
##   ..- attr(*, "label")= chr "Species"
##   ..- attr(*, "format.stata")= chr "%10s"

## 读取matlab数据文件
library(R.matlab)

## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.

## 
## Attaching package: 'R.matlab'

## The following objects are masked from 'package:base':
## 
##     getOption, isOpen

matdata <- readMat("data/chap2/ABC.mat")
str(matdata)

## List of 3
##  $ A: int [1:9, 1:3] 1 2 3 4 5 6 7 8 9 10 ...
##  $ B: int [1:10, 1] 1 2 3 4 5 6 7 8 9 10
##  $ C: int [1:2, 1:3, 1:3] 1 2 3 4 5 6 7 8 9 10 ...
##  - attr(*, "header")=List of 3
##   ..$ description: chr "MATLAB 5.0 MAT-file, Platform: windows, Software: R v2.15.0, Created on: Sat Mar 31 19:50:00 2012                           "
##   ..$ version    : chr "5"
##   ..$ endian     : chr "little"

head(matdata$A,2)

##      [,1] [,2] [,3]
## [1,]    1   10   19
## [2,]    2   11   20

## 读取图片数据
## 读取png图像
library(png)
impng <- readPNG("data/chap2/Rlogo.png")
r <- nrow(impng) / ncol(impng) # image ratio
plot(c(0,1), c(0,r), type = "n", xlab = "", ylab = "", asp=1)
## 该行在Nootbook中不支持，但是在Console中运行正常
rasterImage(impng, 0, 0, 1, r)

str(impng)

##  num [1:76, 1:100, 1:4] 0 0 0 0 0 0 0 0 0 0 ...

## load.image 可以读取多种格式的图像
library(imager)

## Loading required package: magrittr

## 
## Attaching package: 'imager'

## The following object is masked from 'package:magrittr':
## 
##     add

## The following objects are masked from 'package:stats':
## 
##     convolve, spectrum

## The following object is masked from 'package:graphics':
## 
##     frame

## The following object is masked from 'package:base':
## 
##     save.image

imjpg <- load.image("data/chap2/image.jpg")
imdim <- dim(imjpg)
plot(imjpg,xlim = c(1,width(imjpg)),ylim = c(1,height(imjpg)))

通过爬虫获取数据

从HTML中获取链接、表格

library(XML)

## 获取网页中的链接,检查R官网都有哪些链接
fileURL <- "https://www.r-project.org/"
fileURLnew <- sub("https", "http", fileURL)
links <- getHTMLLinks(fileURLnew)
length(links)

## [1] 38

## 从网页中读取数据表格，公牛队球员的数据
fileURL <- "http://www.stat-nba.com/team/CHI.html"
Tab <- readHTMLTable(fileURL)
length(Tab)

## [1] 2

NBAmember <- Tab[[1]]
head(NBAmember)

##             球员 出场 首发 时间  投篮 命中 出手  三分 命中 出手  罚球 命中 出手
## 1      扎克-拉文   60   60 34.8 44.9%  9.0 20.0 38.0%  3.1  8.1 80.2%  4.5  5.6
## 2  劳里-马尔卡宁   50   50 29.8 42.5%  5.0 11.8 34.4%  2.2  6.3 82.4%  2.5  3.1
## 3      科比-怀特   65    1 25.9 39.4%  4.8 12.2 35.4%  2.0  5.8 79.1%  1.6  2.0
## 4      奥托-波特   14    9 23.5 44.3%  4.4 10.0 38.7%  1.7  4.4 70.4%  1.4  1.9
## 5    温德尔-卡特   43   43 29.2 53.4%  4.3  8.0 20.7%  0.1  0.7 73.7%  2.6  3.5
## 6      赛迪斯-杨   64   16 24.9 44.7%  4.2  9.4 35.4%  1.2  3.5 58.3%  0.7  1.1
##   篮板 前场 后场 助攻 抢断 盖帽 失误 犯规 得分
## 1  4.8  0.7  4.1  4.2  1.5  0.5  3.4  2.2 25.5
## 2  6.3  1.2  5.1  1.5  0.8  0.5  1.6  1.9 14.7
## 3  3.6  0.4  3.1  2.7  0.8  0.1  1.7  1.8 13.2
## 4  3.4  0.9  2.5  1.8  1.1  0.4  0.8  2.2 11.9
## 5  9.4  3.2  6.2  1.2  0.8  0.8  1.7  3.8 11.3
## 6  4.9  1.5  3.5  1.8  1.4  0.4  1.6  2.1 10.3

## 使用rvest包获取网络数据
library(rvest)

## Loading required package: xml2

## 
## Attaching package: 'rvest'

## The following object is masked from 'package:XML':
## 
##     xml

## The following object is masked from 'package:readr':
## 
##     guess_encoding

library(stringr)

## 
## Attaching package: 'stringr'

## The following object is masked from 'package:imager':
## 
##     boundary

## 读取网页，获取电影的名称
top250 <- read_html("https://movie.douban.com/top250")
title <-top250 %>% html_nodes("span.title") %>% html_text()
head(title)

## [1] "肖申克的救赎"                " / The Shawshank Redemption"
## [3] "霸王别姬"                    "阿甘正传"                   
## [5] " / Forrest Gump"             "这个杀手不太冷"

## 获取第一个名字
title <- title[is.na(str_match(title,"/"))]
head(title)

## [1] "肖申克的救赎"   "霸王别姬"       "阿甘正传"       "这个杀手不太冷"
## [5] "泰坦尼克号"     "美丽人生"

## 获取电影的评分
score <-top250 %>% html_nodes("span.rating_num") %>% html_text()
filmdf <- data.frame(title = title,score = as.numeric(score))

## 获取电影的主题
term <-top250 %>% html_nodes("span.inq") %>% html_text()
filmdf$term <- term
head(filmdf)

##            title score                           term
## 1   肖申克的救赎   9.7                 希望让人自由。
## 2       霸王别姬   9.6                     风华绝代。
## 3       阿甘正传   9.5             一部美国近现代史。
## 4 这个杀手不太冷   9.4 怪蜀黍和小萝莉不得不说的故事。
## 5     泰坦尼克号   9.4            失去的才是永恒的。 
## 6       美丽人生   9.6                   最美的谎言。

2.2：数据缺失值处理

很多时候数据不会是完整的，会存在有缺失值的情况，这时需要对缺失的数据进行处理。

##读取数据
myair <- read.csv("data/chap2/myairquality.csv")
dim(myair)

## [1] 153   7

summary(myair)

##      Ozone           Solar.R           Wind            Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.70   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.40   1st Qu.:72.25  
##  Median : 31.50   Median :205.0   Median : 9.70   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   :10.01   Mean   :77.87  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.50   3rd Qu.:84.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.70   Max.   :97.00  
##  NA's   :37       NA's   :7       NA's   :4       NA's   :3      
##      Month            Day            Type          
##  Min.   :5.000   Min.   : 1.00   Length:153        
##  1st Qu.:6.000   1st Qu.: 8.00   Class :character  
##  Median :7.000   Median :16.00   Mode  :character  
##  Mean   :6.993   Mean   :15.71                     
##  3rd Qu.:8.000   3rd Qu.:23.00                     
##  Max.   :9.000   Max.   :31.00                     
##  NA's   :3       NA's   :6

## 1:检查数据是否存在缺失值
library(VIM)

## Loading required package: colorspace

## Loading required package: grid

## 
## Attaching package: 'grid'

## The following object is masked from 'package:imager':
## 
##     depth

## VIM is ready to use.

## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

## 可视化查看数据是否有缺失值
aggr(myair)

## complete.cases()输出样例是否包含缺失值
## 输出包含缺失值的样例
mynadata <- myair[!complete.cases(myair),]
dim(mynadata)

## [1] 57  7

head(mynadata)

##    Ozone Solar.R Wind Temp Month Day Type
## 2     36     118  8.0   72     5   2 <NA>
## 5     NA      NA 14.3   56     5   5    C
## 6     28      NA 14.9   66     5   6    B
## 8     19      99 13.8   NA     5   8    A
## 10    NA     194  8.6   69     5  10    C
## 11     7      NA  6.9   74     5  11    B

## matrixplot()可视化缺失值的详细情况
## 红色代表缺失数据的情况
matrixplot(mynadata)

## 
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.

## 只保留没有缺失值的样例
newdata <- na.omit(myair)
dim(newdata)

## [1] 96  7

head(newdata)

##    Ozone Solar.R Wind Temp Month Day Type
## 1     41     190  7.4   67     5   1    A
## 3     12     149 12.6   74     5   3    A
## 4     18     313 11.5   62     5   4    C
## 7     23     299  8.6   65     5   7    B
## 9      8      19 20.1   61     5   9    C
## 12    16     256  9.7   69     5  12    B

## 简单的方法
## 针对不同的情况和变量属性，可以使用不同的缺失值处理方法
## 1: 填补缺失值：
##    均值，中位数，众数等
## is.na()查看Ozone（臭氧）数据缺失值的位置
myair2 <- myair
## 使用均值填补缺失值
myair2$Ozone[is.na(myair$Ozone)] <- mean(myair$Ozone,na.rm = TRUE)

## 输出哪些位置有缺失值
which(is.na(myair$Solar.R))

## [1]  5  6 11 27 96 97 98

## 使用中位数填补缺失值
myair2$Solar.R[which(is.na(myair$Solar.R))] <- median(myair2$Solar.R,na.rm = TRUE)


## 使用前面的或者后面的数据填补缺失值
library(zoo)

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 使用前面或者后面的值来填补缺失值
myair2$Wind <- na.locf(myair$Wind)
myair2$Temp <- na.locf(myair$Temp,fromLast = TRUE)
## 数据中月份数据可以使用前面和后面数据的平均值来填补
## 找到缺失值的位置
naindex <- which(is.na(myair$Month))
newnamonth <- round((myair$Month[naindex-1] + myair$Month[naindex+1]) / 2)
myair2$Month[naindex] <- newnamonth
## 日期数据根据数据情况可以使用前面的数值＋1
naindex <- which(is.na(myair$Day))
newnaday <- myair$Day[naindex-1] + 1
myair2$Day[naindex] <- newnaday


library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following object is masked from 'package:rvest':
## 
##     html

## The following object is masked from 'package:imager':
## 
##     label

## The following objects are masked from 'package:base':
## 
##     format.pval, units

## 使用众数填补缺失值Type变量
## 找出众数
table(myair$Type)

## 
##  A  B  C 
## 39 51 58

myair2$Type <- impute(myair$Type,"C")


## 观察处理后新数据集的缺失值情况
aggr(myair2)

复杂的数据缺失值处理方法

## 复杂的缺失值处理方法
colnames(myair)

## [1] "Ozone"   "Solar.R" "Wind"    "Temp"    "Month"   "Day"     "Type"

## 考虑"Ozone"   "Solar.R" "Wind"    "Temp"之间有关系对四个特征进行缺失值处理
## 提取数据
myair <- myair[,c(1:4)]

## 使用KNN方法来填补缺失值
library(DMwR2)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## 
## Attaching package: 'DMwR2'

## The following object is masked from 'package:VIM':
## 
##     kNN

myair2 <- knnImputation(myair,k=5,scale = TRUE,meth = "weighAvg")

## 使用随机森林的方式填补缺失值
library(missForest)

## Loading required package: randomForest

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:imager':
## 
##     grow

## Loading required package: foreach

## Loading required package: itertools

## Loading required package: iterators

## 
## Attaching package: 'missForest'

## The following object is masked from 'package:VIM':
## 
##     nrmse

myair2 <- missForest(myair,ntree = 50)

##   missForest iteration 1 in progress...done!
##   missForest iteration 2 in progress...done!
##   missForest iteration 3 in progress...done!
##   missForest iteration 4 in progress...done!

## 填补缺失值后的数据
myair2$ximp

##         Ozone Solar.R   Wind     Temp
## 1    41.00000  190.00  7.400 67.00000
## 2    36.00000  118.00  8.000 72.00000
## 3    12.00000  149.00 12.600 74.00000
## 4    18.00000  313.00 11.500 62.00000
## 5    20.12000  143.44 14.300 56.00000
## 6    28.00000  232.80 14.900 66.00000
## 7    23.00000  299.00  8.600 65.00000
## 8    19.00000   99.00 13.800 70.50000
## 9     8.00000   19.00 20.100 61.00000
## 10   26.60000  194.00  8.600 69.00000
## 11    7.00000   53.22  6.900 74.00000
## 12   16.00000  256.00  9.700 69.00000
## 13   11.00000  290.00  9.200 66.00000
## 14   14.00000  274.00 10.900 68.00000
## 15   18.00000   65.00 13.200 58.00000
## 16   14.00000  334.00 11.500 64.00000
## 17   34.00000  307.00 12.000 66.00000
## 18    6.00000   78.00 18.400 57.00000
## 19   30.00000  322.00 11.500 68.00000
## 20   11.00000   44.00  9.700 62.00000
## 21    1.00000    8.00  9.700 59.00000
## 22   11.00000  320.00 12.578 73.00000
## 23    4.00000   25.00  9.700 61.00000
## 24   32.00000   92.00 12.000 61.00000
## 25   11.50000   66.00 16.600 57.00000
## 26   20.62000  266.00 14.900 58.00000
## 27   29.04000  154.80  8.000 57.00000
## 28   23.00000   13.00 12.000 67.00000
## 29   45.00000  252.00 14.900 81.00000
## 30  115.00000  223.00  5.700 79.00000
## 31   37.00000  279.00  7.400 76.00000
## 32   35.30000  286.00  8.600 78.00000
## 33   20.18000  287.00  9.700 74.00000
## 34   25.04000  242.00 16.100 67.00000
## 35   48.98800  186.00  9.200 84.00000
## 36   81.14000  220.00  8.600 85.00000
## 37   31.62833  264.00 14.300 79.00000
## 38   29.00000  127.00  9.700 82.00000
## 39   71.10000  273.00  6.900 87.00000
## 40   71.00000  291.00 13.800 90.00000
## 41   39.00000  323.00 11.500 87.00000
## 42   81.18000  259.00 10.900 93.00000
## 43   77.74000  250.00  9.200 92.00000
## 44   23.00000  148.00  8.000 82.00000
## 45   33.00833  332.00 13.800 80.00000
## 46   34.10000  322.00 11.500 79.00000
## 47   21.00000  191.00 14.900 77.00000
## 48   37.00000  284.00 20.700 72.00000
## 49   20.00000   37.00  9.200 65.00000
## 50   12.00000  120.00 11.500 73.00000
## 51   13.00000  137.00 10.300 76.00000
## 52   37.78000  150.00  6.300 77.00000
## 53   87.66000   59.00  1.700 76.00000
## 54   57.18000   91.00  4.600 76.00000
## 55   46.18000  250.00  6.300 76.00000
## 56   20.77333  135.00  8.000 75.00000
## 57   36.43867  127.00  8.000 78.00000
## 58   10.12000   47.00 10.300 73.00000
## 59   35.40000   98.00 11.500 80.00000
## 60   12.72000   31.00 14.900 77.00000
## 61   43.70000  138.00  8.000 83.00000
## 62  135.00000  269.00  4.100 84.00000
## 63   49.00000  248.00  9.200 85.00000
## 64   32.00000  236.00  9.200 81.00000
## 65   40.84800  101.00 10.900 84.00000
## 66   64.00000  175.00  4.600 83.00000
## 67   40.00000  314.00 10.900 83.00000
## 68   77.00000  276.00  5.100 88.00000
## 69   97.00000  267.00  6.300 92.00000
## 70   97.00000  272.00  5.700 92.00000
## 71   85.00000  175.00  6.092 89.00000
## 72   24.41800  139.00  8.600 82.00000
## 73   10.00000  264.00 14.300 73.00000
## 74   27.00000  175.00 14.900 81.00000
## 75   20.46000  291.00 14.900 70.41633
## 76    7.00000   48.00 14.300 80.00000
## 77   48.00000  260.00  6.900 81.00000
## 78   35.00000  274.00 10.300 82.00000
## 79   61.00000  285.00  6.300 84.00000
## 80   79.00000  187.00  6.336 87.00000
## 81   63.00000  220.00 11.500 85.00000
## 82   16.00000    7.00  6.900 74.00000
## 83   45.22000  258.00  9.700 81.00000
## 84   31.62000  295.00 11.500 82.00000
## 85   80.00000  294.00  8.600 86.00000
## 86  108.00000  223.00  8.000 85.00000
## 87   20.00000   81.00  8.600 82.00000
## 88   52.00000   82.00 12.000 86.00000
## 89   82.00000  213.00  7.400 88.00000
## 90   50.00000  275.00  7.400 82.40000
## 91   64.00000  253.00  7.400 83.00000
## 92   59.00000  254.00  9.200 81.00000
## 93   39.00000   83.00  6.900 81.00000
## 94    9.00000   24.00 13.800 81.00000
## 95   16.00000   77.00  7.400 82.00000
## 96   78.00000  244.72  6.900 86.00000
## 97   35.00000  183.11  7.400 85.00000
## 98   66.00000  208.20  4.600 87.00000
## 99  122.00000  255.00  4.000 89.00000
## 100  89.00000  229.00 10.300 90.00000
## 101 110.00000  207.00  8.000 90.00000
## 102  95.10000  222.00  8.600 92.00000
## 103  48.70000  137.00 11.500 86.00000
## 104  44.00000  192.00 11.500 86.00000
## 105  28.00000  273.00 11.500 82.00000
## 106  65.00000  157.00  9.700 80.00000
## 107  25.70000   64.00 11.500 79.00000
## 108  22.00000   71.00 10.300 77.00000
## 109  59.00000   51.00  6.300 79.00000
## 110  23.00000  115.00  7.400 76.00000
## 111  31.00000  244.00 10.900 78.00000
## 112  44.00000  190.00 10.300 78.00000
## 113  21.00000  259.00 15.500 77.00000
## 114   9.00000   36.00 14.300 72.00000
## 115  17.28000  255.00 12.600 75.00000
## 116  45.00000  212.00  9.700 79.00000
## 117 168.00000  238.00  3.400 81.00000
## 118  73.00000  215.00  8.000 86.00000
## 119  78.44000  153.00  5.700 88.00000
## 120  76.00000  203.00  9.700 97.00000
## 121 118.00000  225.00  6.350 94.00000
## 122  84.00000  237.00  6.300 96.00000
## 123  85.00000  188.00  6.300 94.00000
## 124  96.00000  167.00  6.900 91.00000
## 125  78.00000  197.00  5.100 92.00000
## 126  73.00000  183.00  2.800 93.00000
## 127  91.00000  189.00  4.600 93.00000
## 128  47.00000   95.00  7.400 87.00000
## 129  32.00000   92.00 15.500 84.00000
## 130  20.00000  252.00 10.900 80.00000
## 131  23.00000  220.00 10.300 78.00000
## 132  21.00000  230.00 10.900 75.00000
## 133  24.00000  259.00  9.700 73.00000
## 134  44.00000  236.00 14.900 81.00000
## 135  21.00000  259.00 15.500 76.00000
## 136  28.00000  238.00  6.300 77.00000
## 137   9.00000   24.00 10.900 71.00000
## 138  13.00000  112.00 11.500 71.00000
## 139  46.00000  237.00  6.900 78.00000
## 140  18.00000  224.00 13.800 67.00000
## 141  13.00000   27.00 10.300 76.00000
## 142  24.00000  238.00 10.300 68.00000
## 143  16.00000  201.00  8.000 82.00000
## 144  13.00000  238.00 12.600 64.00000
## 145  23.00000   14.00  9.200 71.00000
## 146  36.00000  139.00 10.300 81.00000
## 147   7.00000   49.00 10.300 69.00000
## 148  14.00000   20.00 16.600 63.00000
## 149  30.00000  193.00  6.900 70.00000
## 150  16.68000  145.00 13.200 77.00000
## 151  14.00000  191.00 14.300 75.00000
## 152  18.00000  131.00  8.000 76.00000
## 153  20.00000  223.00 11.500 68.00000

## OOB误差
myair2$OOBerror

##     NRMSE 
## 0.5792485

## 缺失值多重插补
library(mice)

## 
## Attaching package: 'mice'

## The following object is masked from 'package:imager':
## 
##     squeeze

## The following object is masked from 'package:stats':
## 
##     filter

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

## 进行链式方程的多元插补
## m:多重插补的数量
## method : 指定插补方法
## norm.predict : 线性回归预测；pmm：均值插补方法，rf: 随机森林方法
## norm:贝叶斯线性回归
impdta <- mice(myair,m = 5,method=c("norm.predict","pmm","rf","norm"))

## 
##  iter imp variable
##   1   1  Ozone  Solar.R  Wind  Temp
##   1   2  Ozone  Solar.R  Wind  Temp
##   1   3  Ozone  Solar.R  Wind  Temp
##   1   4  Ozone  Solar.R  Wind  Temp
##   1   5  Ozone  Solar.R  Wind  Temp
##   2   1  Ozone  Solar.R  Wind  Temp
##   2   2  Ozone  Solar.R  Wind  Temp
##   2   3  Ozone  Solar.R  Wind  Temp
##   2   4  Ozone  Solar.R  Wind  Temp
##   2   5  Ozone  Solar.R  Wind  Temp
##   3   1  Ozone  Solar.R  Wind  Temp
##   3   2  Ozone  Solar.R  Wind  Temp
##   3   3  Ozone  Solar.R  Wind  Temp
##   3   4  Ozone  Solar.R  Wind  Temp
##   3   5  Ozone  Solar.R  Wind  Temp
##   4   1  Ozone  Solar.R  Wind  Temp
##   4   2  Ozone  Solar.R  Wind  Temp
##   4   3  Ozone  Solar.R  Wind  Temp
##   4   4  Ozone  Solar.R  Wind  Temp
##   4   5  Ozone  Solar.R  Wind  Temp
##   5   1  Ozone  Solar.R  Wind  Temp
##   5   2  Ozone  Solar.R  Wind  Temp
##   5   3  Ozone  Solar.R  Wind  Temp
##   5   4  Ozone  Solar.R  Wind  Temp
##   5   5  Ozone  Solar.R  Wind  Temp

summary(impdta)

## Class: mids
## Number of multiple imputations:  5 
## Imputation methods:
##          Ozone        Solar.R           Wind           Temp 
## "norm.predict"          "pmm"           "rf"         "norm" 
## PredictorMatrix:
##         Ozone Solar.R Wind Temp
## Ozone       0       1    1    1
## Solar.R     1       0    1    1
## Wind        1       1    0    1
## Temp        1       1    1    0

2.3 数据操作

长宽数据变换,数据标准化处理,数据集切分

长宽数据变换

### 长宽数据变换
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:imager':
## 
##     fill

## The following object is masked from 'package:magrittr':
## 
##     extract

Iris <- read.csv("data/chap2/Iris.csv",header = TRUE)
head(Iris,2)

##   Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm     Species
## 1  1           5.1          3.5           1.4          0.2 Iris-setosa
## 2  2           4.9          3.0           1.4          0.2 Iris-setosa

str(Iris)

## 'data.frame':    150 obs. of  6 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species      : chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...

## 宽数据转化为长数据1
Irislong = gather(Iris,key="varname",value="value",SepalLengthCm:PetalWidthCm)
head(Irislong,2)

##   Id     Species       varname value
## 1  1 Iris-setosa SepalLengthCm   5.1
## 2  2 Iris-setosa SepalLengthCm   4.9

str(Irislong)

## 'data.frame':    600 obs. of  4 variables:
##  $ Id     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Species: chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
##  $ varname: chr  "SepalLengthCm" "SepalLengthCm" "SepalLengthCm" "SepalLengthCm" ...
##  $ value  : num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...

## 长数据转化为宽数据1
IrisWidth <- spread(Irislong,key="varname",value="value")
head(IrisWidth,2)

##   Id     Species PetalLengthCm PetalWidthCm SepalLengthCm SepalWidthCm
## 1  1 Iris-setosa           1.4          0.2           5.1          3.5
## 2  2 Iris-setosa           1.4          0.2           4.9          3.0

str(IrisWidth)

## 'data.frame':    150 obs. of  6 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Species      : chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

## 宽数据转化为长数据2
Irislong = melt(Iris,id = c("Id","Species"),variable.name = "varname",
                value.name="value")
head(Irislong,2)

##   Id     Species       varname value
## 1  1 Iris-setosa SepalLengthCm   5.1
## 2  2 Iris-setosa SepalLengthCm   4.9

str(Irislong)

## 'data.frame':    600 obs. of  4 variables:
##  $ Id     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Species: chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
##  $ varname: Factor w/ 4 levels "SepalLengthCm",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ value  : num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...

## 长数据转化为宽数据2
IrisWidth <- dcast(Irislong,Id+Species~varname)
head(IrisWidth,2)

##   Id     Species SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1  1 Iris-setosa           5.1          3.5           1.4          0.2
## 2  2 Iris-setosa           4.9          3.0           1.4          0.2

str(IrisWidth)

## 'data.frame':    150 obs. of  6 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Species      : chr  "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...

数据汇总

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Irisgroup <- Iris%>%
  ## 根据一个或多个变量分组
  group_by(Species)%>%
  ## 将多个值减少到单个值
  summarise(meanSL = mean(SepalLengthCm),
            medianSW = median(SepalWidthCm),
            sdPL = sd(PetalLengthCm),
            IQRPW = IQR(PetalWidthCm),
            num = n()) %>%
  ## 按变量排列行
  arrange(desc(sdPL))%>%
  ## 返回具有匹配条件的行
  filter(num==50)%>%
  ## 添加新的变量
  mutate(varPL = sdPL^2)

## `summarise()` ungrouping output (override with `.groups` argument)

Irisgroup

## # A tibble: 3 x 7
##   Species         meanSL medianSW  sdPL IQRPW   num  varPL
##   <chr>            <dbl>    <dbl> <dbl> <dbl> <int>  <dbl>
## 1 Iris-virginica    6.59      3   0.552 0.500    50 0.305 
## 2 Iris-versicolor   5.94      2.8 0.470 0.3      50 0.221 
## 3 Iris-setosa       5.01      3.4 0.174 0.100    50 0.0301

数据标准化

Iris <- read.csv("data/chap2/Iris.csv",header = TRUE)
Iris <- Iris[2:5]
head(Iris,2)

##   SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1           5.1          3.5           1.4          0.2
## 2           4.9          3.0           1.4          0.2

str(Iris)

## 'data.frame':    150 obs. of  4 variables:
##  $ SepalLengthCm: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ SepalWidthCm : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ PetalLengthCm: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ PetalWidthCm : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...

## 数据中心化：是指变量减去它的均值；
Irisc <- scale(Iris,center = TRUE, scale = FALSE)
apply(Irisc,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]     -1.543333       -1.054     -2.758667    -1.098667
## [2,]      2.056667        1.346      3.141333     1.301333

## 数据标准化：是指数值减去均值，再除以标准差；
## 数据标准化处理
Iriss <- scale(Iris,center = TRUE, scale = TRUE)
apply(Iriss,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]     -1.863780    -2.430844     -1.563497    -1.439627
## [2,]      2.483699     3.104284      1.780377     1.705189

## min-max标准化方法是对原始数据进行线性变换。
## 设minA和maxA分别为属性A的最小值和最大值，
## 将A的一个原始值x通过min-max标准化映射成在区间[0,1]中的值
## 新数据=（原数据-最小值）/（最大值-最小值）
minmax <- function(x){
  x <- (x-min(x))/(max(x)-min(x))
}

Iris01 <- apply(Iris,2,minmax)
apply(Iris01,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]             0            0             0            0
## [2,]             1            1             1            1

## 使用caret包进行处理
library(caret)

## 
## Attaching package: 'caret'

## The following object is masked from 'package:survival':
## 
##     cluster

## preProcess得到的结果可以使用predict函数作用于新的数据集
## 而且还包括其他方法，如标准化 "scale", "range", 等
## 1 中心化
center <- preProcess(Iris,method = "center")
Irisc <- predict(center,Iris)
head(Irisc,2)

##   SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1    -0.7433333        0.446     -2.358667   -0.9986667
## 2    -0.9433333       -0.054     -2.358667   -0.9986667

apply(Irisc,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]     -1.543333       -1.054     -2.758667    -1.098667
## [2,]      2.056667        1.346      3.141333     1.301333

## 2 标准化
scal <- preProcess(Iris,method = c("center","scale"))
Iriss <- predict(scal,Iris)
head(Iriss,2)

##   SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1    -0.8976739    1.0286113     -1.336794    -1.308593
## 2    -1.1392005   -0.1245404     -1.336794    -1.308593

apply(Iriss,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]     -1.863780    -2.430844     -1.563497    -1.439627
## [2,]      2.483699     3.104284      1.780377     1.705189

## [0-1]化
minmax01 <- preProcess(Iris,method = "range",rangeBounds = c(0,1))
Iris01 <- predict(minmax01,Iris)
apply(Iris01,2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]             0            0             0            0
## [2,]             1            1             1            1

## 数据集切分
Iris <- read.csv("data/chap2/Iris.csv",header = TRUE)
Iris <- Iris[2:6]
head(Iris,2)

##   SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm     Species
## 1           5.1          3.5           1.4          0.2 Iris-setosa
## 2           4.9          3.0           1.4          0.2 Iris-setosa

## 数据集切分 1
num <- round(nrow(Iris)*0.7)
index <- sample(nrow(Iris),size = num)
Iris_train <- Iris[index,]
Iris_test <- Iris[-index,]
dim(Iris_train)

## [1] 105   5

dim(Iris_test)

## [1] 45  5

## 数据集切分2 使用carte包中的函数
## carte包中切分数据的输出为训练数据集在所有数据中的行位置
## 使用createDataPartition获取数据切分的索引
index = createDataPartition(Iris$Species,p=0.7)
Iris_train <- Iris[index$Resample1,]
Iris_test <- Iris[-index$Resample1,]
dim(Iris_train)

## [1] 105   5

dim(Iris_test)

## [1] 45  5

## 获取数据k折的行位置
index2 <- createFolds(Iris$Species,k = 3)
index2

## $Fold1
##  [1]   3   7  11  12  15  19  21  23  27  29  30  37  38  40  41  45  49  55  56
## [20]  57  64  65  69  70  71  73  74  75  79  85  86  87  94  99 105 106 111 114
## [39] 116 122 123 125 128 129 133 134 137 139 140 142
## 
## $Fold2
##  [1]   2   9  10  16  17  18  20  25  28  31  32  33  34  36  43  46  50  51  53
## [20]  59  61  66  67  68  76  82  84  91  93  95  96  97  98 101 103 107 112 113
## [39] 115 117 118 119 124 131 132 138 141 143 144 148
## 
## $Fold3
##  [1]   1   4   5   6   8  13  14  22  24  26  35  39  42  44  47  48  52  54  58
## [20]  60  62  63  72  77  78  80  81  83  88  89  90  92 100 102 104 108 109 110
## [39] 120 121 126 127 130 135 136 145 146 147 149 150

2.4：数据描述

集中趋势，离散程度、偏度和峰度

iris <- read.csv("data/chap2/Iris.csv")

## 数据的集中趋势
## 均值
apply(iris[,c(2:5)],2,mean)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##      5.843333      3.054000      3.758667      1.198667

## 中位数
apply(iris[,c(2:5)],2,median)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##          5.80          3.00          4.35          1.30

## 离散程度
## 方差
apply(iris[,c(2:5)],2,var)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##     0.6856935     0.1880040     3.1131794     0.5824143

## 标准差
apply(iris[,c(2:5)],2,sd)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##     0.8280661     0.4335943     1.7644204     0.7631607

## 中位数绝对偏差
apply(iris[,c(2:5)],2,mad)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##       1.03782       0.37065       1.85325       1.03782

## 变异系数 标准差／均值,越大说明数据越分散
apply(iris[,c(2:5)],2,sd) / apply(iris[,c(2:5)],2,mean)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##     0.1417113     0.1419759     0.4694272     0.6366747

## 四分位数 和 极值
apply(iris[,c(2:5)],2,quantile)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0%             4.3          2.0          1.00          0.1
## 25%            5.1          2.8          1.60          0.3
## 50%            5.8          3.0          4.35          1.3
## 75%            6.4          3.3          5.10          1.8
## 100%           7.9          4.4          6.90          2.5

apply(iris[,c(2:5)],2,fivenum)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]           4.3          2.0          1.00          0.1
## [2,]           5.1          2.8          1.60          0.3
## [3,]           5.8          3.0          4.35          1.3
## [4,]           6.4          3.3          5.10          1.8
## [5,]           7.9          4.4          6.90          2.5

apply(iris[,c(2:5)],2,range)

##      SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,]           4.3          2.0           1.0          0.1
## [2,]           7.9          4.4           6.9          2.5

## 四分位数范围 IQR(x) = quantile(x, 3/4) - quantile(x, 1/4).
apply(iris[,c(2:5)],2,IQR)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##           1.3           0.5           3.5           1.5

## 偏度和峰度,可以使用moments库
library(moments)
apply(iris[,c(2:5)],2,skewness)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##     0.3117531     0.3307028    -0.2717120    -0.1039437

apply(iris[,c(2:5)],2,kurtosis)

## SepalLengthCm  SepalWidthCm PetalLengthCm  PetalWidthCm 
##      2.426432      3.241443      1.604641      1.664754

library(ggplot2)
library(tidyr)
## 宽数据转化为长数据
irislong = gather(iris[,c(2:5)],key="varname",
                  value="value",SepalLengthCm:PetalWidthCm)
## 可视化数据的分布
ggplot(irislong,aes(colour = varname,linetype = varname))+
  theme_bw()+geom_density(aes(value),bw = 0.5)

## 可视化数据的分布
ggplot(irislong,aes(colour = varname,fill = varname,linetype = varname))+
  theme_bw()+geom_density(aes(value),bw = 0.5,alpha = 0.4)

plot(density(iris$SepalWidthCm))

skewness(iris$SepalWidthCm)

## [1] 0.3307028

2.5：数据相似性度量

## 相关系数
cor(iris[,c(2:5)])

##               SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm     1.0000000   -0.1093692     0.8717542    0.8179536
## SepalWidthCm     -0.1093692    1.0000000    -0.4205161   -0.3565441
## PetalLengthCm     0.8717542   -0.4205161     1.0000000    0.9627571
## PetalWidthCm      0.8179536   -0.3565441     0.9627571    1.0000000

## 数据之间的距离
## 计算3种花之间的4个特征均值，然后计算他们之间的距离
## 数据准备
library(dplyr)
newdata <- iris%>%group_by(Species)%>%
  summarise(SepalLengthMean = mean(SepalLengthCm),
            SepalWidthMean = mean(SepalWidthCm),
            PetalLengthMean = mean(PetalLengthCm),
            PetalWidthMean = mean(PetalWidthCm))

## `summarise()` ungrouping output (override with `.groups` argument)

rownames(newdata) <- newdata$Species

## Warning: Setting row names on a tibble is deprecated.

newdata$Species <- NULL
newdata

## # A tibble: 3 x 4
##   SepalLengthMean SepalWidthMean PetalLengthMean PetalWidthMean
##             <dbl>          <dbl>           <dbl>          <dbl>
## 1            5.01           3.42            1.46          0.244
## 2            5.94           2.77            4.26          1.33 
## 3            6.59           2.97            5.55          2.03

##  欧式距离等
dist(newdata,method = "euclidean",upper = T,diag = T)

##          1        2        3
## 1 0.000000 3.205175 4.752592
## 2 3.205175 0.000000 1.620489
## 3 4.752592 1.620489 0.000000

## 曼哈顿距离
dist(newdata,method = "manhattan",upper = T,diag = T)

##       1     2     3
## 1 0.000 5.456 7.896
## 2 5.456 0.000 2.848
## 3 7.896 2.848 0.000

## maximum
dist(newdata,method = "maximum",upper = T,diag = T)

##       1     2     3
## 1 0.000 2.796 4.088
## 2 2.796 0.000 1.292
## 3 4.088 1.292 0.000

## canberra
dist(newdata,method = "canberra",upper = T,diag = T)

##           1         2         3
## 1 0.0000000 1.3673540 1.5736019
## 2 1.3673540 0.0000000 0.4280814
## 3 1.5736019 0.4280814 0.0000000

## minkowski
dist(newdata,method = "minkowski",upper = T,diag = T,p = 0.5)

##          1        2        3
## 1  0.00000 20.08533 27.88796
## 2 20.08533  0.00000 10.44874
## 3 27.88796 10.44874  0.00000