## 1:读取CSV文件
## 方式1
csvdata <- read.csv("data/chap2/Iris.csv",header = TRUE)
head(csvdata)
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 6 5.4 3.9 1.7 0.4 Iris-setosa
str(csvdata)
## 'data.frame': 150 obs. of 6 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## 方式2
csvdata <- read.table("data/chap2/Iris.csv",header = TRUE,sep = ",")
head(csvdata)
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4 4.6 3.1 1.5 0.2 Iris-setosa
## 5 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 6 5.4 3.9 1.7 0.4 Iris-setosa
str(csvdata)
## 'data.frame': 150 obs. of 6 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## 方式3 c = character, i = integer, n = number, d = double, l = logical, D = date, ## T = date time, t = time, ? = guess
library(readr)
csvdata <- read_csv("data/chap2/Iris.csv",col_names = TRUE,
col_types = list("d","d","d","d","d","c"))
head(csvdata,2)
## # A tibble: 2 x 6
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3 1.4 0.2 Iris-setosa
str(csvdata)
## tibble [150 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Id : num [1:150] 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr [1:150] "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## - attr(*, "spec")=
## .. cols(
## .. Id = col_double(),
## .. SepalLengthCm = col_double(),
## .. SepalWidthCm = col_double(),
## .. PetalLengthCm = col_double(),
## .. PetalWidthCm = col_double(),
## .. Species = col_character()
## .. )
## 数据保存为csv
write_csv(csvdata,"data/chap2/IrisWrite_1.csv")
write.csv(csvdata,"data/chap2/IrisWrite_2.csv",quote = FALSE)
## 2: 读取excel数据
library(readxl)
exceldata <- read_excel("data/chap2/Iris.xlsx",sheet = "Iris")
str(exceldata,2)
## tibble [150 × 6] (S3: tbl_df/tbl/data.frame)
## $ Id : num [1:150] 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr [1:150] "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## 读取spss数据
library(foreign)
spssdata <- read.spss("data/chap2/Iris_spss.sav",to.data.frame = TRUE)
head(spssdata,2)
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3.0 1.4 0.2 Iris-setosa
str(spssdata)
## 'data.frame': 150 obs. of 6 variables:
## $ Id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr "Iris-setosa " "Iris-setosa " "Iris-setosa " "Iris-setosa " ...
## - attr(*, "codepage")= int 65001
## 读取spss数据
library(haven)
spssdata <- read_sav("data/chap2/Iris_spss.sav")
head(spssdata,2)
## # A tibble: 2 x 6
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3 1.4 0.2 Iris-setosa
str(spssdata)
## tibble [150 × 6] (S3: tbl_df/tbl/data.frame)
## $ Id : num [1:150] 1 2 3 4 5 6 7 8 9 10 ...
## ..- attr(*, "format.spss")= chr "F11.0"
## ..- attr(*, "display_width")= int 11
## $ SepalLengthCm: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## ..- attr(*, "format.spss")= chr "F11.1"
## ..- attr(*, "display_width")= int 11
## $ SepalWidthCm : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## ..- attr(*, "format.spss")= chr "F11.1"
## ..- attr(*, "display_width")= int 11
## $ PetalLengthCm: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## ..- attr(*, "format.spss")= chr "F11.1"
## ..- attr(*, "display_width")= int 11
## $ PetalWidthCm : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## ..- attr(*, "format.spss")= chr "F11.1"
## ..- attr(*, "display_width")= int 11
## $ Species : chr [1:150] "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## ..- attr(*, "format.spss")= chr "A15"
## ..- attr(*, "display_width")= int 15
## 方法2
spssdata <- read_spss("data/chap2/Iris_spss.sav")
head(spssdata,2)
## # A tibble: 2 x 6
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3 1.4 0.2 Iris-setosa
## 读取SAS数据
sasdata <- read_sas("data/chap2/iris.sas7bdat")
head(sasdata,2)
## # A tibble: 2 x 5
## Sepal_Length Sepal_Width Petal_Length Petal_Width Species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 读取stata数据
dtadata <- read_dta("data/chap2/iris.dta")
head(dtadata,2)
## # A tibble: 2 x 5
## sepallength sepalwidth petallength petalwidth species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.10 3.5 1.40 0.200 setosa
## 2 4.90 3 1.40 0.200 setosa
str(dtadata)
## tibble [150 × 5] (S3: tbl_df/tbl/data.frame)
## $ sepallength: num [1:150] 5.1 4.9 4.7 4.6 5 ...
## ..- attr(*, "label")= chr "Sepal.Length"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ sepalwidth : num [1:150] 3.5 3 3.2 3.1 3.6 ...
## ..- attr(*, "label")= chr "Sepal.Width"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ petallength: num [1:150] 1.4 1.4 1.3 1.5 1.4 ...
## ..- attr(*, "label")= chr "Petal.Length"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ petalwidth : num [1:150] 0.2 0.2 0.2 0.2 0.2 ...
## ..- attr(*, "label")= chr "Petal.Width"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ species : chr [1:150] "setosa" "setosa" "setosa" "setosa" ...
## ..- attr(*, "label")= chr "Species"
## ..- attr(*, "format.stata")= chr "%10s"
dtadata <- read_stata("data/chap2/iris.dta")
head(dtadata,2)
## # A tibble: 2 x 5
## sepallength sepalwidth petallength petalwidth species
## <dbl> <dbl> <dbl> <dbl> <chr>
## 1 5.10 3.5 1.40 0.200 setosa
## 2 4.90 3 1.40 0.200 setosa
str(dtadata)
## tibble [150 × 5] (S3: tbl_df/tbl/data.frame)
## $ sepallength: num [1:150] 5.1 4.9 4.7 4.6 5 ...
## ..- attr(*, "label")= chr "Sepal.Length"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ sepalwidth : num [1:150] 3.5 3 3.2 3.1 3.6 ...
## ..- attr(*, "label")= chr "Sepal.Width"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ petallength: num [1:150] 1.4 1.4 1.3 1.5 1.4 ...
## ..- attr(*, "label")= chr "Petal.Length"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ petalwidth : num [1:150] 0.2 0.2 0.2 0.2 0.2 ...
## ..- attr(*, "label")= chr "Petal.Width"
## ..- attr(*, "format.stata")= chr "%9.0g"
## $ species : chr [1:150] "setosa" "setosa" "setosa" "setosa" ...
## ..- attr(*, "label")= chr "Species"
## ..- attr(*, "format.stata")= chr "%10s"
## 读取matlab数据文件
library(R.matlab)
## R.matlab v3.6.2 (2018-09-26) successfully loaded. See ?R.matlab for help.
##
## Attaching package: 'R.matlab'
## The following objects are masked from 'package:base':
##
## getOption, isOpen
matdata <- readMat("data/chap2/ABC.mat")
str(matdata)
## List of 3
## $ A: int [1:9, 1:3] 1 2 3 4 5 6 7 8 9 10 ...
## $ B: int [1:10, 1] 1 2 3 4 5 6 7 8 9 10
## $ C: int [1:2, 1:3, 1:3] 1 2 3 4 5 6 7 8 9 10 ...
## - attr(*, "header")=List of 3
## ..$ description: chr "MATLAB 5.0 MAT-file, Platform: windows, Software: R v2.15.0, Created on: Sat Mar 31 19:50:00 2012 "
## ..$ version : chr "5"
## ..$ endian : chr "little"
head(matdata$A,2)
## [,1] [,2] [,3]
## [1,] 1 10 19
## [2,] 2 11 20
## 读取图片数据
## 读取png图像
library(png)
impng <- readPNG("data/chap2/Rlogo.png")
r <- nrow(impng) / ncol(impng) # image ratio
plot(c(0,1), c(0,r), type = "n", xlab = "", ylab = "", asp=1)
## 该行在Nootbook中不支持,但是在Console中运行正常
rasterImage(impng, 0, 0, 1, r)
str(impng)
## num [1:76, 1:100, 1:4] 0 0 0 0 0 0 0 0 0 0 ...
## load.image 可以读取多种格式的图像
library(imager)
## Loading required package: magrittr
##
## Attaching package: 'imager'
## The following object is masked from 'package:magrittr':
##
## add
## The following objects are masked from 'package:stats':
##
## convolve, spectrum
## The following object is masked from 'package:graphics':
##
## frame
## The following object is masked from 'package:base':
##
## save.image
imjpg <- load.image("data/chap2/image.jpg")
imdim <- dim(imjpg)
plot(imjpg,xlim = c(1,width(imjpg)),ylim = c(1,height(imjpg)))
library(XML)
## 获取网页中的链接,检查R官网都有哪些链接
fileURL <- "https://www.r-project.org/"
fileURLnew <- sub("https", "http", fileURL)
links <- getHTMLLinks(fileURLnew)
length(links)
## [1] 38
## 从网页中读取数据表格,公牛队球员的数据
fileURL <- "http://www.stat-nba.com/team/CHI.html"
Tab <- readHTMLTable(fileURL)
length(Tab)
## [1] 2
NBAmember <- Tab[[1]]
head(NBAmember)
## 球员 出场 首发 时间 投篮 命中 出手 三分 命中 出手 罚球 命中 出手
## 1 扎克-拉文 60 60 34.8 44.9% 9.0 20.0 38.0% 3.1 8.1 80.2% 4.5 5.6
## 2 劳里-马尔卡宁 50 50 29.8 42.5% 5.0 11.8 34.4% 2.2 6.3 82.4% 2.5 3.1
## 3 科比-怀特 65 1 25.9 39.4% 4.8 12.2 35.4% 2.0 5.8 79.1% 1.6 2.0
## 4 奥托-波特 14 9 23.5 44.3% 4.4 10.0 38.7% 1.7 4.4 70.4% 1.4 1.9
## 5 温德尔-卡特 43 43 29.2 53.4% 4.3 8.0 20.7% 0.1 0.7 73.7% 2.6 3.5
## 6 赛迪斯-杨 64 16 24.9 44.7% 4.2 9.4 35.4% 1.2 3.5 58.3% 0.7 1.1
## 篮板 前场 后场 助攻 抢断 盖帽 失误 犯规 得分
## 1 4.8 0.7 4.1 4.2 1.5 0.5 3.4 2.2 25.5
## 2 6.3 1.2 5.1 1.5 0.8 0.5 1.6 1.9 14.7
## 3 3.6 0.4 3.1 2.7 0.8 0.1 1.7 1.8 13.2
## 4 3.4 0.9 2.5 1.8 1.1 0.4 0.8 2.2 11.9
## 5 9.4 3.2 6.2 1.2 0.8 0.8 1.7 3.8 11.3
## 6 4.9 1.5 3.5 1.8 1.4 0.4 1.6 2.1 10.3
## 使用rvest包获取网络数据
library(rvest)
## Loading required package: xml2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
## The following object is masked from 'package:readr':
##
## guess_encoding
library(stringr)
##
## Attaching package: 'stringr'
## The following object is masked from 'package:imager':
##
## boundary
## 读取网页,获取电影的名称
top250 <- read_html("https://movie.douban.com/top250")
title <-top250 %>% html_nodes("span.title") %>% html_text()
head(title)
## [1] "肖申克的救赎" " / The Shawshank Redemption"
## [3] "霸王别姬" "阿甘正传"
## [5] " / Forrest Gump" "这个杀手不太冷"
## 获取第一个名字
title <- title[is.na(str_match(title,"/"))]
head(title)
## [1] "肖申克的救赎" "霸王别姬" "阿甘正传" "这个杀手不太冷"
## [5] "泰坦尼克号" "美丽人生"
## 获取电影的评分
score <-top250 %>% html_nodes("span.rating_num") %>% html_text()
filmdf <- data.frame(title = title,score = as.numeric(score))
## 获取电影的主题
term <-top250 %>% html_nodes("span.inq") %>% html_text()
filmdf$term <- term
head(filmdf)
## title score term
## 1 肖申克的救赎 9.7 希望让人自由。
## 2 霸王别姬 9.6 风华绝代。
## 3 阿甘正传 9.5 一部美国近现代史。
## 4 这个杀手不太冷 9.4 怪蜀黍和小萝莉不得不说的故事。
## 5 泰坦尼克号 9.4 失去的才是永恒的。
## 6 美丽人生 9.6 最美的谎言。
很多时候数据不会是完整的,会存在有缺失值的情况,这时需要对缺失的数据进行处理。
##读取数据
myair <- read.csv("data/chap2/myairquality.csv")
dim(myair)
## [1] 153 7
summary(myair)
## Ozone Solar.R Wind Temp
## Min. : 1.00 Min. : 7.0 Min. : 1.70 Min. :56.00
## 1st Qu.: 18.00 1st Qu.:115.8 1st Qu.: 7.40 1st Qu.:72.25
## Median : 31.50 Median :205.0 Median : 9.70 Median :79.00
## Mean : 42.13 Mean :185.9 Mean :10.01 Mean :77.87
## 3rd Qu.: 63.25 3rd Qu.:258.8 3rd Qu.:11.50 3rd Qu.:84.00
## Max. :168.00 Max. :334.0 Max. :20.70 Max. :97.00
## NA's :37 NA's :7 NA's :4 NA's :3
## Month Day Type
## Min. :5.000 Min. : 1.00 Length:153
## 1st Qu.:6.000 1st Qu.: 8.00 Class :character
## Median :7.000 Median :16.00 Mode :character
## Mean :6.993 Mean :15.71
## 3rd Qu.:8.000 3rd Qu.:23.00
## Max. :9.000 Max. :31.00
## NA's :3 NA's :6
## 1:检查数据是否存在缺失值
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
##
## Attaching package: 'grid'
## The following object is masked from 'package:imager':
##
## depth
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
## 可视化查看数据是否有缺失值
aggr(myair)
## complete.cases()输出样例是否包含缺失值
## 输出包含缺失值的样例
mynadata <- myair[!complete.cases(myair),]
dim(mynadata)
## [1] 57 7
head(mynadata)
## Ozone Solar.R Wind Temp Month Day Type
## 2 36 118 8.0 72 5 2 <NA>
## 5 NA NA 14.3 56 5 5 C
## 6 28 NA 14.9 66 5 6 B
## 8 19 99 13.8 NA 5 8 A
## 10 NA 194 8.6 69 5 10 C
## 11 7 NA 6.9 74 5 11 B
## matrixplot()可视化缺失值的详细情况
## 红色代表缺失数据的情况
matrixplot(mynadata)
##
## Click in a column to sort by the corresponding variable.
## To regain use of the VIM GUI and the R console, click outside the plot region.
## 只保留没有缺失值的样例
newdata <- na.omit(myair)
dim(newdata)
## [1] 96 7
head(newdata)
## Ozone Solar.R Wind Temp Month Day Type
## 1 41 190 7.4 67 5 1 A
## 3 12 149 12.6 74 5 3 A
## 4 18 313 11.5 62 5 4 C
## 7 23 299 8.6 65 5 7 B
## 9 8 19 20.1 61 5 9 C
## 12 16 256 9.7 69 5 12 B
## 简单的方法
## 针对不同的情况和变量属性,可以使用不同的缺失值处理方法
## 1: 填补缺失值:
## 均值,中位数,众数等
## is.na()查看Ozone(臭氧)数据缺失值的位置
myair2 <- myair
## 使用均值填补缺失值
myair2$Ozone[is.na(myair$Ozone)] <- mean(myair$Ozone,na.rm = TRUE)
## 输出哪些位置有缺失值
which(is.na(myair$Solar.R))
## [1] 5 6 11 27 96 97 98
## 使用中位数填补缺失值
myair2$Solar.R[which(is.na(myair$Solar.R))] <- median(myair2$Solar.R,na.rm = TRUE)
## 使用前面的或者后面的数据填补缺失值
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## 使用前面或者后面的值来填补缺失值
myair2$Wind <- na.locf(myair$Wind)
myair2$Temp <- na.locf(myair$Temp,fromLast = TRUE)
## 数据中月份数据可以使用前面和后面数据的平均值来填补
## 找到缺失值的位置
naindex <- which(is.na(myair$Month))
newnamonth <- round((myair$Month[naindex-1] + myair$Month[naindex+1]) / 2)
myair2$Month[naindex] <- newnamonth
## 日期数据根据数据情况可以使用前面的数值+1
naindex <- which(is.na(myair$Day))
newnaday <- myair$Day[naindex-1] + 1
myair2$Day[naindex] <- newnaday
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following object is masked from 'package:rvest':
##
## html
## The following object is masked from 'package:imager':
##
## label
## The following objects are masked from 'package:base':
##
## format.pval, units
## 使用众数填补缺失值Type变量
## 找出众数
table(myair$Type)
##
## A B C
## 39 51 58
myair2$Type <- impute(myair$Type,"C")
## 观察处理后新数据集的缺失值情况
aggr(myair2)
复杂的数据缺失值处理方法
## 复杂的缺失值处理方法
colnames(myair)
## [1] "Ozone" "Solar.R" "Wind" "Temp" "Month" "Day" "Type"
## 考虑"Ozone" "Solar.R" "Wind" "Temp"之间有关系对四个特征进行缺失值处理
## 提取数据
myair <- myair[,c(1:4)]
## 使用KNN方法来填补缺失值
library(DMwR2)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'DMwR2'
## The following object is masked from 'package:VIM':
##
## kNN
myair2 <- knnImputation(myair,k=5,scale = TRUE,meth = "weighAvg")
## 使用随机森林的方式填补缺失值
library(missForest)
## Loading required package: randomForest
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:imager':
##
## grow
## Loading required package: foreach
## Loading required package: itertools
## Loading required package: iterators
##
## Attaching package: 'missForest'
## The following object is masked from 'package:VIM':
##
## nrmse
myair2 <- missForest(myair,ntree = 50)
## missForest iteration 1 in progress...done!
## missForest iteration 2 in progress...done!
## missForest iteration 3 in progress...done!
## missForest iteration 4 in progress...done!
## 填补缺失值后的数据
myair2$ximp
## Ozone Solar.R Wind Temp
## 1 41.00000 190.00 7.400 67.00000
## 2 36.00000 118.00 8.000 72.00000
## 3 12.00000 149.00 12.600 74.00000
## 4 18.00000 313.00 11.500 62.00000
## 5 20.12000 143.44 14.300 56.00000
## 6 28.00000 232.80 14.900 66.00000
## 7 23.00000 299.00 8.600 65.00000
## 8 19.00000 99.00 13.800 70.50000
## 9 8.00000 19.00 20.100 61.00000
## 10 26.60000 194.00 8.600 69.00000
## 11 7.00000 53.22 6.900 74.00000
## 12 16.00000 256.00 9.700 69.00000
## 13 11.00000 290.00 9.200 66.00000
## 14 14.00000 274.00 10.900 68.00000
## 15 18.00000 65.00 13.200 58.00000
## 16 14.00000 334.00 11.500 64.00000
## 17 34.00000 307.00 12.000 66.00000
## 18 6.00000 78.00 18.400 57.00000
## 19 30.00000 322.00 11.500 68.00000
## 20 11.00000 44.00 9.700 62.00000
## 21 1.00000 8.00 9.700 59.00000
## 22 11.00000 320.00 12.578 73.00000
## 23 4.00000 25.00 9.700 61.00000
## 24 32.00000 92.00 12.000 61.00000
## 25 11.50000 66.00 16.600 57.00000
## 26 20.62000 266.00 14.900 58.00000
## 27 29.04000 154.80 8.000 57.00000
## 28 23.00000 13.00 12.000 67.00000
## 29 45.00000 252.00 14.900 81.00000
## 30 115.00000 223.00 5.700 79.00000
## 31 37.00000 279.00 7.400 76.00000
## 32 35.30000 286.00 8.600 78.00000
## 33 20.18000 287.00 9.700 74.00000
## 34 25.04000 242.00 16.100 67.00000
## 35 48.98800 186.00 9.200 84.00000
## 36 81.14000 220.00 8.600 85.00000
## 37 31.62833 264.00 14.300 79.00000
## 38 29.00000 127.00 9.700 82.00000
## 39 71.10000 273.00 6.900 87.00000
## 40 71.00000 291.00 13.800 90.00000
## 41 39.00000 323.00 11.500 87.00000
## 42 81.18000 259.00 10.900 93.00000
## 43 77.74000 250.00 9.200 92.00000
## 44 23.00000 148.00 8.000 82.00000
## 45 33.00833 332.00 13.800 80.00000
## 46 34.10000 322.00 11.500 79.00000
## 47 21.00000 191.00 14.900 77.00000
## 48 37.00000 284.00 20.700 72.00000
## 49 20.00000 37.00 9.200 65.00000
## 50 12.00000 120.00 11.500 73.00000
## 51 13.00000 137.00 10.300 76.00000
## 52 37.78000 150.00 6.300 77.00000
## 53 87.66000 59.00 1.700 76.00000
## 54 57.18000 91.00 4.600 76.00000
## 55 46.18000 250.00 6.300 76.00000
## 56 20.77333 135.00 8.000 75.00000
## 57 36.43867 127.00 8.000 78.00000
## 58 10.12000 47.00 10.300 73.00000
## 59 35.40000 98.00 11.500 80.00000
## 60 12.72000 31.00 14.900 77.00000
## 61 43.70000 138.00 8.000 83.00000
## 62 135.00000 269.00 4.100 84.00000
## 63 49.00000 248.00 9.200 85.00000
## 64 32.00000 236.00 9.200 81.00000
## 65 40.84800 101.00 10.900 84.00000
## 66 64.00000 175.00 4.600 83.00000
## 67 40.00000 314.00 10.900 83.00000
## 68 77.00000 276.00 5.100 88.00000
## 69 97.00000 267.00 6.300 92.00000
## 70 97.00000 272.00 5.700 92.00000
## 71 85.00000 175.00 6.092 89.00000
## 72 24.41800 139.00 8.600 82.00000
## 73 10.00000 264.00 14.300 73.00000
## 74 27.00000 175.00 14.900 81.00000
## 75 20.46000 291.00 14.900 70.41633
## 76 7.00000 48.00 14.300 80.00000
## 77 48.00000 260.00 6.900 81.00000
## 78 35.00000 274.00 10.300 82.00000
## 79 61.00000 285.00 6.300 84.00000
## 80 79.00000 187.00 6.336 87.00000
## 81 63.00000 220.00 11.500 85.00000
## 82 16.00000 7.00 6.900 74.00000
## 83 45.22000 258.00 9.700 81.00000
## 84 31.62000 295.00 11.500 82.00000
## 85 80.00000 294.00 8.600 86.00000
## 86 108.00000 223.00 8.000 85.00000
## 87 20.00000 81.00 8.600 82.00000
## 88 52.00000 82.00 12.000 86.00000
## 89 82.00000 213.00 7.400 88.00000
## 90 50.00000 275.00 7.400 82.40000
## 91 64.00000 253.00 7.400 83.00000
## 92 59.00000 254.00 9.200 81.00000
## 93 39.00000 83.00 6.900 81.00000
## 94 9.00000 24.00 13.800 81.00000
## 95 16.00000 77.00 7.400 82.00000
## 96 78.00000 244.72 6.900 86.00000
## 97 35.00000 183.11 7.400 85.00000
## 98 66.00000 208.20 4.600 87.00000
## 99 122.00000 255.00 4.000 89.00000
## 100 89.00000 229.00 10.300 90.00000
## 101 110.00000 207.00 8.000 90.00000
## 102 95.10000 222.00 8.600 92.00000
## 103 48.70000 137.00 11.500 86.00000
## 104 44.00000 192.00 11.500 86.00000
## 105 28.00000 273.00 11.500 82.00000
## 106 65.00000 157.00 9.700 80.00000
## 107 25.70000 64.00 11.500 79.00000
## 108 22.00000 71.00 10.300 77.00000
## 109 59.00000 51.00 6.300 79.00000
## 110 23.00000 115.00 7.400 76.00000
## 111 31.00000 244.00 10.900 78.00000
## 112 44.00000 190.00 10.300 78.00000
## 113 21.00000 259.00 15.500 77.00000
## 114 9.00000 36.00 14.300 72.00000
## 115 17.28000 255.00 12.600 75.00000
## 116 45.00000 212.00 9.700 79.00000
## 117 168.00000 238.00 3.400 81.00000
## 118 73.00000 215.00 8.000 86.00000
## 119 78.44000 153.00 5.700 88.00000
## 120 76.00000 203.00 9.700 97.00000
## 121 118.00000 225.00 6.350 94.00000
## 122 84.00000 237.00 6.300 96.00000
## 123 85.00000 188.00 6.300 94.00000
## 124 96.00000 167.00 6.900 91.00000
## 125 78.00000 197.00 5.100 92.00000
## 126 73.00000 183.00 2.800 93.00000
## 127 91.00000 189.00 4.600 93.00000
## 128 47.00000 95.00 7.400 87.00000
## 129 32.00000 92.00 15.500 84.00000
## 130 20.00000 252.00 10.900 80.00000
## 131 23.00000 220.00 10.300 78.00000
## 132 21.00000 230.00 10.900 75.00000
## 133 24.00000 259.00 9.700 73.00000
## 134 44.00000 236.00 14.900 81.00000
## 135 21.00000 259.00 15.500 76.00000
## 136 28.00000 238.00 6.300 77.00000
## 137 9.00000 24.00 10.900 71.00000
## 138 13.00000 112.00 11.500 71.00000
## 139 46.00000 237.00 6.900 78.00000
## 140 18.00000 224.00 13.800 67.00000
## 141 13.00000 27.00 10.300 76.00000
## 142 24.00000 238.00 10.300 68.00000
## 143 16.00000 201.00 8.000 82.00000
## 144 13.00000 238.00 12.600 64.00000
## 145 23.00000 14.00 9.200 71.00000
## 146 36.00000 139.00 10.300 81.00000
## 147 7.00000 49.00 10.300 69.00000
## 148 14.00000 20.00 16.600 63.00000
## 149 30.00000 193.00 6.900 70.00000
## 150 16.68000 145.00 13.200 77.00000
## 151 14.00000 191.00 14.300 75.00000
## 152 18.00000 131.00 8.000 76.00000
## 153 20.00000 223.00 11.500 68.00000
## OOB误差
myair2$OOBerror
## NRMSE
## 0.5792485
## 缺失值多重插补
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:imager':
##
## squeeze
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
## 进行链式方程的多元插补
## m:多重插补的数量
## method : 指定插补方法
## norm.predict : 线性回归预测;pmm:均值插补方法,rf: 随机森林方法
## norm:贝叶斯线性回归
impdta <- mice(myair,m = 5,method=c("norm.predict","pmm","rf","norm"))
##
## iter imp variable
## 1 1 Ozone Solar.R Wind Temp
## 1 2 Ozone Solar.R Wind Temp
## 1 3 Ozone Solar.R Wind Temp
## 1 4 Ozone Solar.R Wind Temp
## 1 5 Ozone Solar.R Wind Temp
## 2 1 Ozone Solar.R Wind Temp
## 2 2 Ozone Solar.R Wind Temp
## 2 3 Ozone Solar.R Wind Temp
## 2 4 Ozone Solar.R Wind Temp
## 2 5 Ozone Solar.R Wind Temp
## 3 1 Ozone Solar.R Wind Temp
## 3 2 Ozone Solar.R Wind Temp
## 3 3 Ozone Solar.R Wind Temp
## 3 4 Ozone Solar.R Wind Temp
## 3 5 Ozone Solar.R Wind Temp
## 4 1 Ozone Solar.R Wind Temp
## 4 2 Ozone Solar.R Wind Temp
## 4 3 Ozone Solar.R Wind Temp
## 4 4 Ozone Solar.R Wind Temp
## 4 5 Ozone Solar.R Wind Temp
## 5 1 Ozone Solar.R Wind Temp
## 5 2 Ozone Solar.R Wind Temp
## 5 3 Ozone Solar.R Wind Temp
## 5 4 Ozone Solar.R Wind Temp
## 5 5 Ozone Solar.R Wind Temp
summary(impdta)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## Ozone Solar.R Wind Temp
## "norm.predict" "pmm" "rf" "norm"
## PredictorMatrix:
## Ozone Solar.R Wind Temp
## Ozone 0 1 1 1
## Solar.R 1 0 1 1
## Wind 1 1 0 1
## Temp 1 1 1 0
长宽数据变换,数据标准化处理,数据集切分
### 长宽数据变换
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:imager':
##
## fill
## The following object is masked from 'package:magrittr':
##
## extract
Iris <- read.csv("data/chap2/Iris.csv",header = TRUE)
head(Iris,2)
## Id SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 2 4.9 3.0 1.4 0.2 Iris-setosa
str(Iris)
## 'data.frame': 150 obs. of 6 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## 宽数据转化为长数据1
Irislong = gather(Iris,key="varname",value="value",SepalLengthCm:PetalWidthCm)
head(Irislong,2)
## Id Species varname value
## 1 1 Iris-setosa SepalLengthCm 5.1
## 2 2 Iris-setosa SepalLengthCm 4.9
str(Irislong)
## 'data.frame': 600 obs. of 4 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Species: chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## $ varname: chr "SepalLengthCm" "SepalLengthCm" "SepalLengthCm" "SepalLengthCm" ...
## $ value : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## 长数据转化为宽数据1
IrisWidth <- spread(Irislong,key="varname",value="value")
head(IrisWidth,2)
## Id Species PetalLengthCm PetalWidthCm SepalLengthCm SepalWidthCm
## 1 1 Iris-setosa 1.4 0.2 5.1 3.5
## 2 2 Iris-setosa 1.4 0.2 4.9 3.0
str(IrisWidth)
## 'data.frame': 150 obs. of 6 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Species : chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
## 宽数据转化为长数据2
Irislong = melt(Iris,id = c("Id","Species"),variable.name = "varname",
value.name="value")
head(Irislong,2)
## Id Species varname value
## 1 1 Iris-setosa SepalLengthCm 5.1
## 2 2 Iris-setosa SepalLengthCm 4.9
str(Irislong)
## 'data.frame': 600 obs. of 4 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Species: chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## $ varname: Factor w/ 4 levels "SepalLengthCm",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ value : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## 长数据转化为宽数据2
IrisWidth <- dcast(Irislong,Id+Species~varname)
head(IrisWidth,2)
## Id Species SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1 1 Iris-setosa 5.1 3.5 1.4 0.2
## 2 2 Iris-setosa 4.9 3.0 1.4 0.2
str(IrisWidth)
## 'data.frame': 150 obs. of 6 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Species : chr "Iris-setosa" "Iris-setosa" "Iris-setosa" "Iris-setosa" ...
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Irisgroup <- Iris%>%
## 根据一个或多个变量分组
group_by(Species)%>%
## 将多个值减少到单个值
summarise(meanSL = mean(SepalLengthCm),
medianSW = median(SepalWidthCm),
sdPL = sd(PetalLengthCm),
IQRPW = IQR(PetalWidthCm),
num = n()) %>%
## 按变量排列行
arrange(desc(sdPL))%>%
## 返回具有匹配条件的行
filter(num==50)%>%
## 添加新的变量
mutate(varPL = sdPL^2)
## `summarise()` ungrouping output (override with `.groups` argument)
Irisgroup
## # A tibble: 3 x 7
## Species meanSL medianSW sdPL IQRPW num varPL
## <chr> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 Iris-virginica 6.59 3 0.552 0.500 50 0.305
## 2 Iris-versicolor 5.94 2.8 0.470 0.3 50 0.221
## 3 Iris-setosa 5.01 3.4 0.174 0.100 50 0.0301
Iris <- read.csv("data/chap2/Iris.csv",header = TRUE)
Iris <- Iris[2:5]
head(Iris,2)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
str(Iris)
## 'data.frame': 150 obs. of 4 variables:
## $ SepalLengthCm: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ SepalWidthCm : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ PetalLengthCm: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ PetalWidthCm : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## 数据中心化:是指变量减去它的均值;
Irisc <- scale(Iris,center = TRUE, scale = FALSE)
apply(Irisc,2,range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] -1.543333 -1.054 -2.758667 -1.098667
## [2,] 2.056667 1.346 3.141333 1.301333
## 数据标准化:是指数值减去均值,再除以标准差;
## 数据标准化处理
Iriss <- scale(Iris,center = TRUE, scale = TRUE)
apply(Iriss,2,range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] -1.863780 -2.430844 -1.563497 -1.439627
## [2,] 2.483699 3.104284 1.780377 1.705189
## min-max标准化方法是对原始数据进行线性变换。
## 设minA和maxA分别为属性A的最小值和最大值,
## 将A的一个原始值x通过min-max标准化映射成在区间[0,1]中的值
## 新数据=(原数据-最小值)/(最大值-最小值)
minmax <- function(x){
x <- (x-min(x))/(max(x)-min(x))
}
Iris01 <- apply(Iris,2,minmax)
apply(Iris01,2,range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] 0 0 0 0
## [2,] 1 1 1 1
## 使用caret包进行处理
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## preProcess得到的结果可以使用predict函数作用于新的数据集
## 而且还包括其他方法,如标准化 "scale", "range", 等
## 1 中心化
center <- preProcess(Iris,method = "center")
Irisc <- predict(center,Iris)
head(Irisc,2)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1 -0.7433333 0.446 -2.358667 -0.9986667
## 2 -0.9433333 -0.054 -2.358667 -0.9986667
apply(Irisc,2,range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] -1.543333 -1.054 -2.758667 -1.098667
## [2,] 2.056667 1.346 3.141333 1.301333
## 2 标准化
scal <- preProcess(Iris,method = c("center","scale"))
Iriss <- predict(scal,Iris)
head(Iriss,2)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1 -0.8976739 1.0286113 -1.336794 -1.308593
## 2 -1.1392005 -0.1245404 -1.336794 -1.308593
apply(Iriss,2,range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] -1.863780 -2.430844 -1.563497 -1.439627
## [2,] 2.483699 3.104284 1.780377 1.705189
## [0-1]化
minmax01 <- preProcess(Iris,method = "range",rangeBounds = c(0,1))
Iris01 <- predict(minmax01,Iris)
apply(Iris01,2,range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] 0 0 0 0
## [2,] 1 1 1 1
## 数据集切分
Iris <- read.csv("data/chap2/Iris.csv",header = TRUE)
Iris <- Iris[2:6]
head(Iris,2)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm Species
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 数据集切分 1
num <- round(nrow(Iris)*0.7)
index <- sample(nrow(Iris),size = num)
Iris_train <- Iris[index,]
Iris_test <- Iris[-index,]
dim(Iris_train)
## [1] 105 5
dim(Iris_test)
## [1] 45 5
## 数据集切分2 使用carte包中的函数
## carte包中切分数据的输出为训练数据集在所有数据中的行位置
## 使用createDataPartition获取数据切分的索引
index = createDataPartition(Iris$Species,p=0.7)
Iris_train <- Iris[index$Resample1,]
Iris_test <- Iris[-index$Resample1,]
dim(Iris_train)
## [1] 105 5
dim(Iris_test)
## [1] 45 5
## 获取数据k折的行位置
index2 <- createFolds(Iris$Species,k = 3)
index2
## $Fold1
## [1] 3 7 11 12 15 19 21 23 27 29 30 37 38 40 41 45 49 55 56
## [20] 57 64 65 69 70 71 73 74 75 79 85 86 87 94 99 105 106 111 114
## [39] 116 122 123 125 128 129 133 134 137 139 140 142
##
## $Fold2
## [1] 2 9 10 16 17 18 20 25 28 31 32 33 34 36 43 46 50 51 53
## [20] 59 61 66 67 68 76 82 84 91 93 95 96 97 98 101 103 107 112 113
## [39] 115 117 118 119 124 131 132 138 141 143 144 148
##
## $Fold3
## [1] 1 4 5 6 8 13 14 22 24 26 35 39 42 44 47 48 52 54 58
## [20] 60 62 63 72 77 78 80 81 83 88 89 90 92 100 102 104 108 109 110
## [39] 120 121 126 127 130 135 136 145 146 147 149 150
iris <- read.csv("data/chap2/Iris.csv")
## 数据的集中趋势
## 均值
apply(iris[,c(2:5)],2,mean)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 5.843333 3.054000 3.758667 1.198667
## 中位数
apply(iris[,c(2:5)],2,median)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 5.80 3.00 4.35 1.30
## 离散程度
## 方差
apply(iris[,c(2:5)],2,var)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0.6856935 0.1880040 3.1131794 0.5824143
## 标准差
apply(iris[,c(2:5)],2,sd)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0.8280661 0.4335943 1.7644204 0.7631607
## 中位数绝对偏差
apply(iris[,c(2:5)],2,mad)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1.03782 0.37065 1.85325 1.03782
## 变异系数 标准差/均值,越大说明数据越分散
apply(iris[,c(2:5)],2,sd) / apply(iris[,c(2:5)],2,mean)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0.1417113 0.1419759 0.4694272 0.6366747
## 四分位数 和 极值
apply(iris[,c(2:5)],2,quantile)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0% 4.3 2.0 1.00 0.1
## 25% 5.1 2.8 1.60 0.3
## 50% 5.8 3.0 4.35 1.3
## 75% 6.4 3.3 5.10 1.8
## 100% 7.9 4.4 6.90 2.5
apply(iris[,c(2:5)],2,fivenum)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] 4.3 2.0 1.00 0.1
## [2,] 5.1 2.8 1.60 0.3
## [3,] 5.8 3.0 4.35 1.3
## [4,] 6.4 3.3 5.10 1.8
## [5,] 7.9 4.4 6.90 2.5
apply(iris[,c(2:5)],2,range)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## [1,] 4.3 2.0 1.0 0.1
## [2,] 7.9 4.4 6.9 2.5
## 四分位数范围 IQR(x) = quantile(x, 3/4) - quantile(x, 1/4).
apply(iris[,c(2:5)],2,IQR)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 1.3 0.5 3.5 1.5
## 偏度和峰度,可以使用moments库
library(moments)
apply(iris[,c(2:5)],2,skewness)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 0.3117531 0.3307028 -0.2717120 -0.1039437
apply(iris[,c(2:5)],2,kurtosis)
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## 2.426432 3.241443 1.604641 1.664754
library(ggplot2)
library(tidyr)
## 宽数据转化为长数据
irislong = gather(iris[,c(2:5)],key="varname",
value="value",SepalLengthCm:PetalWidthCm)
## 可视化数据的分布
ggplot(irislong,aes(colour = varname,linetype = varname))+
theme_bw()+geom_density(aes(value),bw = 0.5)
## 可视化数据的分布
ggplot(irislong,aes(colour = varname,fill = varname,linetype = varname))+
theme_bw()+geom_density(aes(value),bw = 0.5,alpha = 0.4)
plot(density(iris$SepalWidthCm))
skewness(iris$SepalWidthCm)
## [1] 0.3307028
## 相关系数
cor(iris[,c(2:5)])
## SepalLengthCm SepalWidthCm PetalLengthCm PetalWidthCm
## SepalLengthCm 1.0000000 -0.1093692 0.8717542 0.8179536
## SepalWidthCm -0.1093692 1.0000000 -0.4205161 -0.3565441
## PetalLengthCm 0.8717542 -0.4205161 1.0000000 0.9627571
## PetalWidthCm 0.8179536 -0.3565441 0.9627571 1.0000000
## 数据之间的距离
## 计算3种花之间的4个特征均值,然后计算他们之间的距离
## 数据准备
library(dplyr)
newdata <- iris%>%group_by(Species)%>%
summarise(SepalLengthMean = mean(SepalLengthCm),
SepalWidthMean = mean(SepalWidthCm),
PetalLengthMean = mean(PetalLengthCm),
PetalWidthMean = mean(PetalWidthCm))
## `summarise()` ungrouping output (override with `.groups` argument)
rownames(newdata) <- newdata$Species
## Warning: Setting row names on a tibble is deprecated.
newdata$Species <- NULL
newdata
## # A tibble: 3 x 4
## SepalLengthMean SepalWidthMean PetalLengthMean PetalWidthMean
## <dbl> <dbl> <dbl> <dbl>
## 1 5.01 3.42 1.46 0.244
## 2 5.94 2.77 4.26 1.33
## 3 6.59 2.97 5.55 2.03
## 欧式距离等
dist(newdata,method = "euclidean",upper = T,diag = T)
## 1 2 3
## 1 0.000000 3.205175 4.752592
## 2 3.205175 0.000000 1.620489
## 3 4.752592 1.620489 0.000000
## 曼哈顿距离
dist(newdata,method = "manhattan",upper = T,diag = T)
## 1 2 3
## 1 0.000 5.456 7.896
## 2 5.456 0.000 2.848
## 3 7.896 2.848 0.000
## maximum
dist(newdata,method = "maximum",upper = T,diag = T)
## 1 2 3
## 1 0.000 2.796 4.088
## 2 2.796 0.000 1.292
## 3 4.088 1.292 0.000
## canberra
dist(newdata,method = "canberra",upper = T,diag = T)
## 1 2 3
## 1 0.0000000 1.3673540 1.5736019
## 2 1.3673540 0.0000000 0.4280814
## 3 1.5736019 0.4280814 0.0000000
## minkowski
dist(newdata,method = "minkowski",upper = T,diag = T,p = 0.5)
## 1 2 3
## 1 0.00000 20.08533 27.88796
## 2 20.08533 0.00000 10.44874
## 3 27.88796 10.44874 0.00000