Subsetting Data

ISI-BUDS 2023

glimpse(ca_covid_hosp)
Rows: 47,205
Columns: 9
$ county                                <chr> "Placer", "Kern", "Madera", "Men…
$ todays_date                           <date> 2020-03-29, 2020-03-29, 2020-03…
$ hospitalized_covid_confirmed_patients <dbl> 18, 16, 1, 1, 0, 20, 1, 46, 121,…
$ hospitalized_suspected_covid_patients <dbl> 14, 57, 6, 11, 0, 48, 2, 27, 211…
$ hospitalized_covid_patients           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, …
$ all_hospital_beds                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, …
$ icu_covid_confirmed_patients          <dbl> 4, 8, 1, 0, 0, 9, 0, 16, 77, 2, …
$ icu_suspected_covid_patients          <dbl> 1, 8, 1, 0, 0, 8, 0, 10, 39, 5, …
$ icu_available_beds                    <dbl> 10, 39, 8, 1, 2, 25, 3, 28, 205,…

subsetting variables/columns

select()

subsetting observations/rows

slice() and filter()

select is used to select certain variables in the data frame.

select(ca_covid_hosp, county, todays_date,
       hospitalized_covid_confirmed_patients)
# A tibble: 47,205 × 3
   county     todays_date hospitalized_covid_confirmed_patients
   <chr>      <date>                                      <dbl>
 1 Placer     2020-03-29                                     18
 2 Kern       2020-03-29                                     16
 3 Madera     2020-03-29                                      1
 4 Mendocino  2020-03-29                                      1
 5 Tehama     2020-03-29                                      0
 6 Stanislaus 2020-03-29                                     20
 7 Del Norte  2020-03-29                                      1
 8 San Mateo  2020-03-29                                     46
 9 San Diego  2020-03-29                                    121
10 Sonoma     2020-03-29                                      7
# ℹ 47,195 more rows

ca_covid_hosp %>% 
  select(county, todays_date,
       hospitalized_covid_confirmed_patients)
# A tibble: 47,205 × 3
   county     todays_date hospitalized_covid_confirmed_patients
   <chr>      <date>                                      <dbl>
 1 Placer     2020-03-29                                     18
 2 Kern       2020-03-29                                     16
 3 Madera     2020-03-29                                      1
 4 Mendocino  2020-03-29                                      1
 5 Tehama     2020-03-29                                      0
 6 Stanislaus 2020-03-29                                     20
 7 Del Norte  2020-03-29                                      1
 8 San Mateo  2020-03-29                                     46
 9 San Diego  2020-03-29                                    121
10 Sonoma     2020-03-29                                      7
# ℹ 47,195 more rows

select can also be used to drop certain variables if used with a negative sign.

select(ca_covid_hosp, -all_hospital_beds,
       -icu_available_beds)
# A tibble: 47,205 × 7
   county     todays_date hospitalized_covid_confirmed_…¹ hospitalized_suspect…²
   <chr>      <date>                                <dbl>                  <dbl>
 1 Placer     2020-03-29                               18                     14
 2 Kern       2020-03-29                               16                     57
 3 Madera     2020-03-29                                1                      6
 4 Mendocino  2020-03-29                                1                     11
 5 Tehama     2020-03-29                                0                      0
 6 Stanislaus 2020-03-29                               20                     48
 7 Del Norte  2020-03-29                                1                      2
 8 San Mateo  2020-03-29                               46                     27
 9 San Diego  2020-03-29                              121                    211
10 Sonoma     2020-03-29                                7                     20
# ℹ 47,195 more rows
# ℹ abbreviated names: ¹​hospitalized_covid_confirmed_patients,
#   ²​hospitalized_suspected_covid_patients
# ℹ 3 more variables: hospitalized_covid_patients <dbl>,
#   icu_covid_confirmed_patients <dbl>, icu_suspected_covid_patients <dbl>

Selection helpers

starts_with()
ends_with()
contains()

select(ca_covid_hosp, starts_with("hospitalized"))
# A tibble: 47,205 × 3
   hospitalized_covid_confirmed_…¹ hospitalized_suspect…² hospitalized_covid_p…³
                             <dbl>                  <dbl>                  <dbl>
 1                              18                     14                     NA
 2                              16                     57                     NA
 3                               1                      6                     NA
 4                               1                     11                     NA
 5                               0                      0                     NA
 6                              20                     48                     NA
 7                               1                      2                     NA
 8                              46                     27                     NA
 9                             121                    211                     NA
10                               7                     20                     NA
# ℹ 47,195 more rows
# ℹ abbreviated names: ¹​hospitalized_covid_confirmed_patients,
#   ²​hospitalized_suspected_covid_patients, ³​hospitalized_covid_patients

select(ca_covid_hosp, ends_with("covid_confirmed_patients"))
# A tibble: 47,205 × 2
   hospitalized_covid_confirmed_patients icu_covid_confirmed_patients
                                   <dbl>                        <dbl>
 1                                    18                            4
 2                                    16                            8
 3                                     1                            1
 4                                     1                            0
 5                                     0                            0
 6                                    20                            9
 7                                     1                            0
 8                                    46                           16
 9                                   121                           77
10                                     7                            2
# ℹ 47,195 more rows

select(ca_covid_hosp, contains("suspected"))
# A tibble: 47,205 × 2
   hospitalized_suspected_covid_patients icu_suspected_covid_patients
                                   <dbl>                        <dbl>
 1                                    14                            1
 2                                    57                            8
 3                                     6                            1
 4                                    11                            0
 5                                     0                            0
 6                                    48                            8
 7                                     2                            0
 8                                    27                           10
 9                                   211                           39
10                                    20                            5
# ℹ 47,195 more rows

subsetting variables/columns

select()

subsetting observations/rows

slice() and filter()

slice() subsetting rows based on a row number.

The data below include all the rows from third to seventh. Including third and seventh.

slice(ca_covid_hosp, 3:7)
# A tibble: 5 × 9
  county     todays_date hospitalized_covid_confirmed_p…¹ hospitalized_suspect…²
  <chr>      <date>                                 <dbl>                  <dbl>
1 Madera     2020-03-29                                 1                      6
2 Mendocino  2020-03-29                                 1                     11
3 Tehama     2020-03-29                                 0                      0
4 Stanislaus 2020-03-29                                20                     48
5 Del Norte  2020-03-29                                 1                      2
# ℹ abbreviated names: ¹​hospitalized_covid_confirmed_patients,
#   ²​hospitalized_suspected_covid_patients
# ℹ 5 more variables: hospitalized_covid_patients <dbl>,
#   all_hospital_beds <dbl>, icu_covid_confirmed_patients <dbl>,
#   icu_suspected_covid_patients <dbl>, icu_available_beds <dbl>

filter() subsetting rows based on a condition.

The data below includes rows corresponding to Orange county.

filter(ca_covid_hosp, county == "Orange")
# A tibble: 843 × 9
   county todays_date hospitalized_covid_confirmed_pati…¹ hospitalized_suspect…²
   <chr>  <date>                                    <dbl>                  <dbl>
 1 Orange 2020-03-29                                   74                    167
 2 Orange 2020-03-30                                   98                    251
 3 Orange 2020-03-31                                  105                    233
 4 Orange 2020-04-01                                  117                    221
 5 Orange 2020-04-02                                  129                    225
 6 Orange 2020-04-03                                  131                    182
 7 Orange 2020-04-04                                  143                    213
 8 Orange 2020-04-05                                  144                    221
 9 Orange 2020-04-06                                  145                    171
10 Orange 2020-04-07                                  143                    144
# ℹ 833 more rows
# ℹ abbreviated names: ¹​hospitalized_covid_confirmed_patients,
#   ²​hospitalized_suspected_covid_patients
# ℹ 5 more variables: hospitalized_covid_patients <dbl>,
#   all_hospital_beds <dbl>, icu_covid_confirmed_patients <dbl>,
#   icu_suspected_covid_patients <dbl>, icu_available_beds <dbl>

Relational Operators in R

Operator Description
< Less than
> Greater than
<= Less than or equal to
>= Greater than or equal to
== Equal to
!= Not equal to

Logical Operators in R

Operator Description
& and
| or

Q. On which days the total number of hospitalized confirmed COVID patients in Orange county exceeded 2000?

ca_covid_hosp %>% 
  filter(county == "Orange" & 
           hospitalized_covid_confirmed_patients > 2200)
# A tibble: 7 × 9
  county todays_date hospitalized_covid_confirmed_patie…¹ hospitalized_suspect…²
  <chr>  <date>                                     <dbl>                  <dbl>
1 Orange 2021-01-04                                  2236                     42
2 Orange 2021-01-05                                  2250                     42
3 Orange 2021-01-06                                  2253                     53
4 Orange 2021-01-07                                  2259                     49
5 Orange 2021-01-08                                  2221                     39
6 Orange 2021-01-09                                  2216                     34
7 Orange 2021-01-10                                  2221                     56
# ℹ abbreviated names: ¹​hospitalized_covid_confirmed_patients,
#   ²​hospitalized_suspected_covid_patients
# ℹ 5 more variables: hospitalized_covid_patients <dbl>,
#   all_hospital_beds <dbl>, icu_covid_confirmed_patients <dbl>,
#   icu_suspected_covid_patients <dbl>, icu_available_beds <dbl>

ca_covid_hosp %>% 
  filter(county=="Orange" & 
           hospitalized_covid_confirmed_patients>2200) %>% 
  nrow()
[1] 7

Q. On which days the total number of hospitalized confirmed COVID patients in Orange county exceeded 2000 and the total number of confirmed COVID patients in ICU exceeded 500?

ca_covid_hosp %>% 
  filter(county == "Orange" & 
           hospitalized_covid_confirmed_patients > 2200 &
           icu_covid_confirmed_patients >500)
# A tibble: 6 × 9
  county todays_date hospitalized_covid_confirmed_patie…¹ hospitalized_suspect…²
  <chr>  <date>                                     <dbl>                  <dbl>
1 Orange 2021-01-04                                  2236                     42
2 Orange 2021-01-06                                  2253                     53
3 Orange 2021-01-07                                  2259                     49
4 Orange 2021-01-08                                  2221                     39
5 Orange 2021-01-09                                  2216                     34
6 Orange 2021-01-10                                  2221                     56
# ℹ abbreviated names: ¹​hospitalized_covid_confirmed_patients,
#   ²​hospitalized_suspected_covid_patients
# ℹ 5 more variables: hospitalized_covid_patients <dbl>,
#   all_hospital_beds <dbl>, icu_covid_confirmed_patients <dbl>,
#   icu_suspected_covid_patients <dbl>, icu_available_beds <dbl>

We have done all sorts of selections, slicing, filtering on ca_covid_hosp but it has not changed at all. Why do you think so?

glimpse(ca_covid_hosp)
Rows: 47,205
Columns: 9
$ county                                <chr> "Placer", "Kern", "Madera", "Men…
$ todays_date                           <date> 2020-03-29, 2020-03-29, 2020-03…
$ hospitalized_covid_confirmed_patients <dbl> 18, 16, 1, 1, 0, 20, 1, 46, 121,…
$ hospitalized_suspected_covid_patients <dbl> 14, 57, 6, 11, 0, 48, 2, 27, 211…
$ hospitalized_covid_patients           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, …
$ all_hospital_beds                     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, …
$ icu_covid_confirmed_patients          <dbl> 4, 8, 1, 0, 0, 9, 0, 16, 77, 2, …
$ icu_suspected_covid_patients          <dbl> 1, 8, 1, 0, 0, 8, 0, 10, 39, 5, …
$ icu_available_beds                    <dbl> 10, 39, 8, 1, 2, 25, 3, 28, 205,…