数据处理第一节：选取列的基本到高级方法

library(tidyverse)

#built-in R dataset
glimpse(msleep)

## Observations: 83
## Variables: 11
## $name <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Grea... ##$ genus        <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina", "Bo...
## $vore <chr> "carni", "omni", "herbi", "omni", "herbi", "herbi... ##$ order        <chr> "Carnivora", "Primates", "Rodentia", "Soricomorph...
## $conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", N... ##$ sleep_total  <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0, 10.1...
## $sleep_rem <dbl> NA, 1.8, 2.4, 2.3, 0.7, 2.2, 1.4, NA, 2.9, NA, 0.... ##$ sleep_cycle  <dbl> NA, NA, NA, 0.1333333, 0.6666667, 0.7666667, 0.38...
## $awake <dbl> 11.9, 7.0, 9.6, 9.1, 20.0, 9.6, 15.3, 17.0, 13.9,... ##$ brainwt      <dbl> NA, 0.01550, NA, 0.00029, 0.42300, NA, NA, NA, 0....
## $bodywt <dbl> 50.000, 0.480, 1.350, 0.019, 600.000, 3.850, 20.4...  选取列 选取列：基础部分 如果目的是选择其中几列，只需在select语句中添加列的名称即可。 添加它们的顺序将决定它们在output中的显示顺序。 msleep %>% select(name, genus, sleep_total, awake) %>% glimpse() ## Observations: 83 ## Variables: 4 ##$ name        <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Great...
## $genus <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina", "Bos... ##$ sleep_total <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0, 10.1,...
## $awake <dbl> 11.9, 7.0, 9.6, 9.1, 20.0, 9.6, 15.3, 17.0, 13.9, ...  如果你想添加很多列，可以通过使用chunks提高工作效率，取消选择甚至取消选择列并重新添加它来进行选择 直接。 同时可以请使用start_col：end_col语法选择某些列： msleep %>% select(name:order, sleep_total:sleep_cycle) %>% glimpse ## Observations: 83 ## Variables: 7 ##$ name        <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Great...
## $genus <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina", "Bos... ##$ vore        <chr> "carni", "omni", "herbi", "omni", "herbi", "herbi"...
## $order <chr> "Carnivora", "Primates", "Rodentia", "Soricomorpha... ##$ sleep_total <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0, 10.1,...
## $sleep_rem <dbl> NA, 1.8, 2.4, 2.3, 0.7, 2.2, 1.4, NA, 2.9, NA, 0.6... ##$ sleep_cycle <dbl> NA, NA, NA, 0.1333333, 0.6666667, 0.7666667, 0.383...



msleep %>%
select(-conservation, -(sleep_total:awake)) %>%
glimpse

## Observations: 83
## Variables: 6
## $name <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Greater s... ##$ genus   <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina", "Bos", "...
## $vore <chr> "carni", "omni", "herbi", "omni", "herbi", "herbi", "c... ##$ order   <chr> "Carnivora", "Primates", "Rodentia", "Soricomorpha", "...
## $brainwt <dbl> NA, 0.01550, NA, 0.00029, 0.42300, NA, NA, NA, 0.07000... ##$ bodywt  <dbl> 50.000, 0.480, 1.350, 0.019, 600.000, 3.850, 20.490, 0...



msleep %>%
select(-(name:awake), conservation) %>%
glimpse

## Observations: 83
## Variables: 3
## $brainwt <dbl> NA, 0.01550, NA, 0.00029, 0.42300, NA, NA, NA, 0.... ##$ bodywt       <dbl> 50.000, 0.480, 1.350, 0.019, 600.000, 3.850, 20.4...
## $conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", N...  根据列名特点选择列 如果你有很多具有类似列名的列，你可以通过在select语句中添加starts_with（）ends_with（）contains（）来使用匹配。 msleep %>% select(name, starts_with("sleep")) %>% glimpse ## Observations: 83 ## Variables: 4 ##$ name        <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Great...
## $sleep_total <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0, 10.1,... ##$ sleep_rem   <dbl> NA, 1.8, 2.4, 2.3, 0.7, 2.2, 1.4, NA, 2.9, NA, 0.6...
## $sleep_cycle <dbl> NA, NA, NA, 0.1333333, 0.6666667, 0.7666667, 0.383... msleep %>% select(contains("eep"), ends_with("wt")) %>% glimpse ## Observations: 83 ## Variables: 5 ##$ sleep_total <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0, 10.1,...
## $sleep_rem <dbl> NA, 1.8, 2.4, 2.3, 0.7, 2.2, 1.4, NA, 2.9, NA, 0.6... ##$ sleep_cycle <dbl> NA, NA, NA, 0.1333333, 0.6666667, 0.7666667, 0.383...
## $brainwt <dbl> NA, 0.01550, NA, 0.00029, 0.42300, NA, NA, NA, 0.0... ##$ bodywt      <dbl> 50.000, 0.480, 1.350, 0.019, 600.000, 3.850, 20.49...



根据正则表达式选择列

#selecting based on regex
msleep %>%
select(matches("o.+er")) %>%
glimpse

## Observations: 83
## Variables: 2
## $order <chr> "Carnivora", "Primates", "Rodentia", "Soricomorph... ##$ conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", N...



根据预先确定的列名选择列

classification <- c("name", "genus", "vore", "order", "conservation")

msleep %>%
select(!!classification)

## # A tibble: 83 x 5
##    name                       genus       vore  order        conservation
##    <chr>                      <chr>       <chr> <chr>        <chr>
##  1 Cheetah                    Acinonyx    carni Carnivora    lc
##  2 Owl monkey                 Aotus       omni  Primates     <NA>
##  3 Mountain beaver            Aplodontia  herbi Rodentia     nt
##  4 Greater short-tailed shrew Blarina     omni  Soricomorpha lc
##  5 Cow                        Bos         herbi Artiodactyla domesticated
##  6 Three-toed sloth           Bradypus    herbi Pilosa       <NA>
##  7 Northern fur seal          Callorhinus carni Carnivora    vu
##  8 Vesper mouse               Calomys     <NA>  Rodentia     <NA>
##  9 Dog                        Canis       carni Carnivora    domesticated
## 10 Roe deer                   Capreolus   herbi Artiodactyla lc
## # ... with 73 more rows



根据数据类型选择列

select_if函数允许您传递返回逻辑语句的函数。 例如，您可以使用select_if（is.character）选择所有字符串列。 同样，你可以添加is.numericis.integeris.doubleis.logicalis.factor。如果你有日期列，你可以加载lubridate包，并使用is.POSIXtis.Date

msleep %>%
select_if(is.numeric) %>%
glimpse

## Observations: 83
## Variables: 6
## $sleep_total <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0, 10.1,... ##$ sleep_rem   <dbl> NA, 1.8, 2.4, 2.3, 0.7, 2.2, 1.4, NA, 2.9, NA, 0.6...
## $sleep_cycle <dbl> NA, NA, NA, 0.1333333, 0.6666667, 0.7666667, 0.383... ##$ awake       <dbl> 11.9, 7.0, 9.6, 9.1, 20.0, 9.6, 15.3, 17.0, 13.9, ...
## $brainwt <dbl> NA, 0.01550, NA, 0.00029, 0.42300, NA, NA, NA, 0.0... ##$ bodywt      <dbl> 50.000, 0.480, 1.350, 0.019, 600.000, 3.850, 20.49...



msleep %>%
select_if(~!is.numeric(.)) %>%
glimpse

## Observations: 83
## Variables: 5
## $name <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Grea... ##$ genus        <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina", "Bo...
## $vore <chr> "carni", "omni", "herbi", "omni", "herbi", "herbi... ##$ order        <chr> "Carnivora", "Primates", "Rodentia", "Soricomorph...
## $conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", N...  按逻辑表达式选择列 实际上，select_if允许您根据任何逻辑函数进行选择，而不仅仅基于数据类型。 例如，可以选择平均值大于500的所有列。 为避免错误，您还必须仅选择数字列，您可以提前执行此操作以获得更简单的语法，也可以在同一行中执行。类似地，'mean> 500本身不是一个函数，所以你需要先添加一个代字号，或者将它包装在funs（）中以将语句转换为函数。 msleep %>% select_if(is.numeric) %>% select_if(~mean(., na.rm=TRUE) > 10)  或者更简便: msleep %>% select_if(~is.numeric(.) & mean(., na.rm=TRUE) > 10) ## # A tibble: 83 x 3 ## sleep_total awake bodywt ## <dbl> <dbl> <dbl> ## 1 12.1 11.9 50.0 ## 2 17.0 7.00 0.480 ## 3 14.4 9.60 1.35 ## 4 14.9 9.10 0.0190 ## 5 4.00 20.0 600 ## 6 14.4 9.60 3.85 ## 7 8.70 15.3 20.5 ## 8 7.00 17.0 0.0450 ## 9 10.1 13.9 14.0 ## 10 3.00 21.0 14.8 ## # ... with 73 more rows  select_if的另一个有用功能是n_distinct（），它计算可以在列中找到的不同值的数量。例如，要返回少于10个不同答案的列，请在select_if语句中传递~n_distinct（。）<10。 鉴于n_distinct（。）<10不是函数，你需要在前面放一个波浪号。 msleep %>% select_if(~n_distinct(.) < 10) ## # A tibble: 83 x 2 ## vore conservation ## <chr> <chr> ## 1 carni lc ## 2 omni <NA> ## 3 herbi nt ## 4 omni lc ## 5 herbi domesticated ## 6 herbi <NA> ## 7 carni vu ## 8 <NA> <NA> ## 9 carni domesticated ## 10 herbi lc ## # ... with 73 more rows  对列重新排序 您可以使用select（）函数（见下文）重新排序列。 您选择它们的顺序将决定最终的顺序。 msleep %>% select(conservation, sleep_total, name) %>% glimpse ## Observations: 83 ## Variables: 3 ##$ conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", N...
## $sleep_total <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0, 10.1... ##$ name         <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Grea...



msleep %>%
select(conservation, sleep_total, everything()) %>%
glimpse

## Observations: 83
## Variables: 11
## $conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", N... ##$ sleep_total  <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0, 10.1...
## $name <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Grea... ##$ genus        <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina", "Bo...
## $vore <chr> "carni", "omni", "herbi", "omni", "herbi", "herbi... ##$ order        <chr> "Carnivora", "Primates", "Rodentia", "Soricomorph...
## $sleep_rem <dbl> NA, 1.8, 2.4, 2.3, 0.7, 2.2, 1.4, NA, 2.9, NA, 0.... ##$ sleep_cycle  <dbl> NA, NA, NA, 0.1333333, 0.6666667, 0.7666667, 0.38...
## $awake <dbl> 11.9, 7.0, 9.6, 9.1, 20.0, 9.6, 15.3, 17.0, 13.9,... ##$ brainwt      <dbl> NA, 0.01550, NA, 0.00029, 0.42300, NA, NA, NA, 0....
## $bodywt <dbl> 50.000, 0.480, 1.350, 0.019, 600.000, 3.850, 20.4...  列名 有时候列名称本身需要进行更改： 重命名列 如果您将使用select（）语句，则可以在select函数中直接重命名。 msleep %>% select(animal = name, sleep_total, extinction_threat = conservation) %>% glimpse ## Observations: 83 ## Variables: 3 ##$ animal            <chr> "Cheetah", "Owl monkey", "Mountain beaver", ...
## $sleep_total <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0,... ##$ extinction_threat <chr> "lc", NA, "nt", "lc", "domesticated", NA, "v...



msleep %>%
rename(animal = name, extinction_threat = conservation) %>%
glimpse

## Observations: 83
## Variables: 11
## $animal <chr> "Cheetah", "Owl monkey", "Mountain beaver", ... ##$ genus             <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina"...
## $vore <chr> "carni", "omni", "herbi", "omni", "herbi", "... ##$ order             <chr> "Carnivora", "Primates", "Rodentia", "Sorico...
## $extinction_threat <chr> "lc", NA, "nt", "lc", "domesticated", NA, "v... ##$ sleep_total       <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0,...
## $sleep_rem <dbl> NA, 1.8, 2.4, 2.3, 0.7, 2.2, 1.4, NA, 2.9, N... ##$ sleep_cycle       <dbl> NA, NA, NA, 0.1333333, 0.6666667, 0.7666667,...
## $awake <dbl> 11.9, 7.0, 9.6, 9.1, 20.0, 9.6, 15.3, 17.0, ... ##$ brainwt           <dbl> NA, 0.01550, NA, 0.00029, 0.42300, NA, NA, N...
## \$ bodywt            <dbl> 50.000, 0.480, 1.350, 0.019, 600.000, 3.850,...



格式化所有列名

select_all（）函数允许更改所有列，并将函数作为参数。如果想以大写形式获取所有列名，可以使用toupper（），同样可以使用小写tolower（）

msleep %>%
select_all(toupper)

## # A tibble: 83 x 11
##    NAME   GENUS VORE  ORDER CONSERVATION SLEEP_TOTAL SLEEP_REM SLEEP_CYCLE
##    <chr>  <chr> <chr> <chr> <chr>              <dbl>     <dbl>       <dbl>
##  1 Cheet~ Acin~ carni Carn~ lc                 12.1     NA          NA
##  2 Owl m~ Aotus omni  Prim~ <NA>               17.0      1.80       NA
##  3 Mount~ Aplo~ herbi Rode~ nt                 14.4      2.40       NA
##  4 Great~ Blar~ omni  Sori~ lc                 14.9      2.30        0.133
##  5 Cow    Bos   herbi Arti~ domesticated        4.00     0.700       0.667
##  6 Three~ Brad~ herbi Pilo~ <NA>               14.4      2.20        0.767
##  7 North~ Call~ carni Carn~ vu                  8.70     1.40        0.383
##  8 Vespe~ Calo~ <NA>  Rode~ <NA>                7.00    NA          NA
##  9 Dog    Canis carni Carn~ domesticated       10.1      2.90        0.333
## 10 Roe d~ Capr~ herbi Arti~ lc                  3.00    NA          NA
## # ... with 73 more rows, and 3 more variables: AWAKE <dbl>, BRAINWT <dbl>,
## #   BODYWT <dbl>



#making an unclean database:
msleep2 <- select(msleep, name, sleep_total, brainwt)
colnames(msleep2) <- c("name", "sleep total", "brain weight")

msleep2 %>%
select_all(~str_replace(., " ", "_"))

## # A tibble: 83 x 3
##    name                       sleep_total brain_weight
##    <chr>                            <dbl>        <dbl>
##  1 Cheetah                          12.1     NA
##  2 Owl monkey                       17.0      0.0155
##  3 Mountain beaver                  14.4     NA
##  4 Greater short-tailed shrew       14.9      0.000290
##  5 Cow                               4.00     0.423
##  6 Three-toed sloth                 14.4     NA
##  7 Northern fur seal                 8.70    NA
##  8 Vesper mouse                      7.00    NA
##  9 Dog                              10.1      0.0700
## 10 Roe deer                          3.00     0.0982
## # ... with 73 more rows



#making an unclean database:
msleep2 <- select(msleep, name, sleep_total, brainwt)
colnames(msleep2) <- c("Q1 name", "Q2 sleep total", "Q3 brain weight")
msleep2[1:3,]

## # A tibble: 3 x 3
##   Q1 name       Q2 sleep total Q3 brain weight
##   <chr>                      <dbl>             <dbl>
## 1 Cheetah                     12.1           NA
## 2 Owl monkey                  17.0            0.0155
## 3 Mountain beaver             14.4           NA



msleep2 %>%
select_all(~str_replace(., "Q[0-9]+", "")) %>%
select_all(~str_replace(., " ", "_"))

## # A tibble: 83 x 3
##    _name                    _sleep total _brain weight
##    <chr>                               <dbl>           <dbl>
##  1 Cheetah                             12.1        NA
##  2 Owl monkey                          17.0         0.0155
##  3 Mountain beaver                     14.4        NA
##  4 Greater short-tailed shrew          14.9         0.000290
##  5 Cow                                  4.00        0.423
##  6 Three-toed sloth                    14.4        NA
##  7 Northern fur seal                    8.70       NA
##  8 Vesper mouse                         7.00       NA
##  9 Dog                                 10.1         0.0700
## 10 Roe deer                             3.00        0.0982
## # ... with 73 more rows



行名转换成列

 mtcars %>%

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1



 mtcars %>%
tibble::rownames_to_column("car_model") %>%

##           car_model  mpg cyl disp  hp drat    wt  qsec vs am gear carb
## 1         Mazda RX4 21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## 2     Mazda RX4 Wag 21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## 3        Datsun 710 22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## 4    Hornet 4 Drive 21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## 5 Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## 6           Valiant 18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
`

|
4月前
|

25 0
|
4月前

23 0
|
SQL 移动开发 BI
【SQL开发实战技巧】系列（二十三）：数仓报表场景☞ 如何对数据排列组合去重以及通过如何找到包含最大值和最小值的记录这个问题再次用执行计划给你证明分析函数性能不一定高

207 0
|

pandas数据处理高级系列001-如何用一行代码优雅的删除一行数据中不包含特定字符串的行
pandas数据处理高级系列001-如何用一行代码优雅的删除一行数据中不包含特定字符串的行
174 0
|
SQL BI 数据库
|
SQL 数据可视化 BI
PowerBI基础知识——度量值
SQL数据库开发
1083 0
|
SQL 数据可视化 数据库
|

145 0

103 0
|
Python
Python编程语言学习：判断两个列表是否对应完全相等(巧解输出是一摸一样的列表数据，但就是不相等)
Python编程语言学习：判断两个列表是否对应完全相等(巧解输出是一摸一样的列表数据，但就是不相等)
307 0