Cleaning data

A common step during data analysis is to clean up the raw data. We fix any obvious errors, edit column names, exclude rows we do not want, and save the cleaned up data set. We do the analysis on the cleaned data set.

Load libraries

library(readr) # read and write tabular data
library(dplyr) # manipulate data
library(pdftools) # process pdfs
library(stringr) # work with string
library(here) # file paths

iNaturalist

Cleaning up iNaturalist file

Save the file paths as objects.

observations_raw_path <- tempfile()
zip_path <- here('data/raw/observations-513664.csv.zip')
unzip(zipfile = zip_path, exdir = observations_raw_path)

Warning in unzip(zipfile = zip_path, exdir = observations_raw_path): error 1 in
extracting from zip file

observations_cleaned_path <- here('data/cleaned/cnc-los-angeles-observations.csv')

The CSV we downloaded from iNaturalist had several problems.

some observation did not have a species
some observations did not have location information
some observations were not in Los Angeles
there were some fields that we did not need
information about observations with threatened, native, and introduced species had to be downloaded separately

In order to simplify things for the workshop attendees, we fixed the various issues and created a cleaned up CSV.

These are the steps we took to produced the cleaned up file. The raw file is at ‘/var/folders/zw/0kb57jnn5nv9k3pgxsz86n7h0000gq/T//Rtmpel1mxP/file5e7c583e3ff2’, the cleaned file is at ‘/Users/wyk/Development/science/city_nature_challenge/NHMLA_workshop/after_inaturalist_lesson/data/cleaned/cnc-los-angeles-observations.csv’

tmp <- tempfile()
zip_path <- here('data/raw/observations-513664.zip')
unzip(zipfile = zip_path, exdir = tmp)

inat_raw <- read_csv(paste(tmp, 'observations-513664.csv', sep='/'))

Use colSums(is.na()) to count the number of rows that have NA values for each column.

colSums(is.na(inat_raw))

                              id                             uuid 
                               0                                0 
              observed_on_string                      observed_on 
                               0                                0 
                time_observed_at                        time_zone 
                            5862                                0 
                         user_id                       user_login 
                               0                                0 
                       user_name                       created_at 
                           72542                                0 
                      updated_at                    quality_grade 
                               0                                0 
                         license                              url 
                           48002                                0 
                       image_url                        sound_url 
                            2530                           192990 
                        tag_list                      description 
                          186884                           169938 
   num_identification_agreements num_identification_disagreements 
                               0                                0 
              captive_cultivated             oauth_application_id 
                               0                            71706 
                     place_guess                         latitude 
                             440                              438 
                       longitude              positional_accuracy 
                             438                            40407 
             private_place_guess                 private_latitude 
                          193738                           193738 
               private_longitude       public_positional_accuracy 
                          193738                            36853 
                      geoprivacy                 taxon_geoprivacy 
                          180034                           144575 
            coordinates_obscured               positioning_method 
                               0                           176278 
              positioning_device                    species_guess 
                          174658                            29909 
                 scientific_name                      common_name 
                            1671                            12223 
               iconic_taxon_name                         taxon_id 
                            1862                             1670 
              taxon_kingdom_name                taxon_phylum_name 
                            1846                             3906 
            taxon_subphylum_name            taxon_superclass_name 
                           15073                           191497 
                taxon_class_name              taxon_subclass_name 
                            6165                           142186 
           taxon_superorder_name                 taxon_order_name 
                          183410                            10661 
             taxon_suborder_name           taxon_superfamily_name 
                          148094                           153153 
               taxon_family_name             taxon_subfamily_name 
                           16037                            79392 
           taxon_supertribe_name                 taxon_tribe_name 
                          190875                           105525 
             taxon_subtribe_name                 taxon_genus_name 
                          168941                            28054 
          taxon_genushybrid_name               taxon_species_name 
                          193738                            55501 
               taxon_hybrid_name            taxon_subspecies_name 
                          193139                           190760 
              taxon_variety_name                  taxon_form_name 
                          192195                           193738

All rows have id, observed_on, and user_id.

some rows don’t have scientific_name, latitude or longitude.

Use filter to select the observations we want.

!is.na will select rows that have are not NA, meaning rows that have a value. We select observations that have latitude, longitude, and scientific_name. We also ignored observations outside of LA

We save the cleaned up data in a new object inat.

inat <- inat_raw %>% 
  filter(!is.na(latitude) &
           !is.na(longitude) &
           !is.na(scientific_name)) %>%
    filter(latitude < 40)

The original data frame ‘inat_raw’ had 193K rows, the cleaned data frame ‘inat’ has 191K rows.

We can double check our work.

latitude, longitude, scientific_name have zero NA.

colSums(is.na(inat))

                              id                             uuid 
                               0                                0 
              observed_on_string                      observed_on 
                               0                                0 
                time_observed_at                        time_zone 
                            5819                                0 
                         user_id                       user_login 
                               0                                0 
                       user_name                       created_at 
                           71186                                0 
                      updated_at                    quality_grade 
                               0                                0 
                         license                              url 
                           47396                                0 
                       image_url                        sound_url 
                            2243                           190893 
                        tag_list                      description 
                          184803                           168134 
   num_identification_agreements num_identification_disagreements 
                               0                                0 
              captive_cultivated             oauth_application_id 
                               0                            71018 
                     place_guess                         latitude 
                               1                                0 
                       longitude              positional_accuracy 
                               0                            40057 
             private_place_guess                 private_latitude 
                          191638                           191638 
               private_longitude       public_positional_accuracy 
                          191638                            36540 
                      geoprivacy                 taxon_geoprivacy 
                          178390                           142538 
            coordinates_obscured               positioning_method 
                               0                           174275 
              positioning_device                    species_guess 
                          172680                            28434 
                 scientific_name                      common_name 
                               0                            10525 
               iconic_taxon_name                         taxon_id 
                             190                                0 
              taxon_kingdom_name                taxon_phylum_name 
                             174                             2221 
            taxon_subphylum_name            taxon_superclass_name 
                           13372                           189405 
                taxon_class_name              taxon_subclass_name 
                            4465                           140237 
           taxon_superorder_name                 taxon_order_name 
                          181350                             8944 
             taxon_suborder_name           taxon_superfamily_name 
                          146151                           151144 
               taxon_family_name             taxon_subfamily_name 
                           14269                            77518 
           taxon_supertribe_name                 taxon_tribe_name 
                          188777                           103590 
             taxon_subtribe_name                 taxon_genus_name 
                          166888                            26258 
          taxon_genushybrid_name               taxon_species_name 
                          191638                            53654 
               taxon_hybrid_name            taxon_subspecies_name 
                          191043                           188661 
              taxon_variety_name                  taxon_form_name 
                          190096                           191638

We want to delete unneeded columns to keep the CSV under 100 MB for Github

names(inat)

 [1] "id"                               "uuid"                            
 [3] "observed_on_string"               "observed_on"                     
 [5] "time_observed_at"                 "time_zone"                       
 [7] "user_id"                          "user_login"                      
 [9] "user_name"                        "created_at"                      
[11] "updated_at"                       "quality_grade"                   
[13] "license"                          "url"                             
[15] "image_url"                        "sound_url"                       
[17] "tag_list"                         "description"                     
[19] "num_identification_agreements"    "num_identification_disagreements"
[21] "captive_cultivated"               "oauth_application_id"            
[23] "place_guess"                      "latitude"                        
[25] "longitude"                        "positional_accuracy"             
[27] "private_place_guess"              "private_latitude"                
[29] "private_longitude"                "public_positional_accuracy"      
[31] "geoprivacy"                       "taxon_geoprivacy"                
[33] "coordinates_obscured"             "positioning_method"              
[35] "positioning_device"               "species_guess"                   
[37] "scientific_name"                  "common_name"                     
[39] "iconic_taxon_name"                "taxon_id"                        
[41] "taxon_kingdom_name"               "taxon_phylum_name"               
[43] "taxon_subphylum_name"             "taxon_superclass_name"           
[45] "taxon_class_name"                 "taxon_subclass_name"             
[47] "taxon_superorder_name"            "taxon_order_name"                
[49] "taxon_suborder_name"              "taxon_superfamily_name"          
[51] "taxon_family_name"                "taxon_subfamily_name"            
[53] "taxon_supertribe_name"            "taxon_tribe_name"                
[55] "taxon_subtribe_name"              "taxon_genus_name"                
[57] "taxon_genushybrid_name"           "taxon_species_name"              
[59] "taxon_hybrid_name"                "taxon_subspecies_name"           
[61] "taxon_variety_name"               "taxon_form_name"

Setting a column to NULL will remove the column.

inat$uuid <- NULL
inat$observed_on_string <- NULL
inat$time_zone <- NULL
inat$num_identification_agreements <- NULL
inat$num_identification_disagreements <- NULL
inat$oauth_application_id <- NULL
inat$place_guess <- NULL
inat$private_place_guess <- NULL
inat$private_longitude <- NULL
inat$private_latitude <- NULL
inat$positioning_method <- NULL
inat$positioning_device <- NULL

use filter and dim to see how often the taxon names were used.

inat %>% filter(!is.na(taxon_subphylum_name)) %>% dim

[1] 178266     50

inat %>% filter(!is.na(taxon_superclass_name)) %>% dim

[1] 2233   50

inat %>% filter(!is.na(taxon_subclass_name)) %>% dim

[1] 51401    50

inat %>% filter(!is.na(taxon_superorder_name)) %>% dim

[1] 10288    50

inat %>% filter(!is.na(taxon_suborder_name)) %>% dim

[1] 45487    50

inat %>% filter(!is.na(taxon_superfamily_name)) %>% dim

[1] 40494    50

inat %>% filter(!is.na(taxon_subfamily_name)) %>% dim

[1] 114120     50

inat %>% filter(!is.na(taxon_supertribe_name)) %>% dim

[1] 2861   50

inat %>% filter(!is.na(taxon_tribe_name)) %>% dim

[1] 88048    50

inat %>% filter(!is.na(taxon_subtribe_name)) %>% dim

[1] 24750    50

inat %>% filter(!is.na(taxon_genushybrid_name)) %>% dim

[1]  0 50

inat %>% filter(!is.na(taxon_hybrid_name)) %>% dim

[1] 595  50

inat %>% filter(!is.na(taxon_subspecies_name)) %>% dim

[1] 2977   50

inat %>% filter(!is.na(taxon_variety_name)) %>% dim

[1] 1542   50

inat %>% filter(!is.na(taxon_form_name)) %>% dim

[1]  0 50

inat$species_guess <- NULL

inat$taxon_subphylum_name <- NULL

inat$taxon_superclass_name <- NULL
inat$taxon_subclass_name <- NULL

inat$taxon_superorder_name <- NULL
inat$taxon_suborder_name <- NULL

inat$taxon_superfamily_name <- NULL
inat$taxon_subfamily_name <- NULL

inat$taxon_supertribe_name <- NULL
inat$taxon_tribe_name <- NULL
inat$taxon_subtribe_name <- NULL

inat$taxon_genushybrid_name <- NULL
inat$taxon_hybrid_name <- NULL

inat$taxon_variety_name <- NULL
inat$taxon_form_name <- NULL

names(inat)

 [1] "id"                         "observed_on"               
 [3] "time_observed_at"           "user_id"                   
 [5] "user_login"                 "user_name"                 
 [7] "created_at"                 "updated_at"                
 [9] "quality_grade"              "license"                   
[11] "url"                        "image_url"                 
[13] "sound_url"                  "tag_list"                  
[15] "description"                "captive_cultivated"        
[17] "latitude"                   "longitude"                 
[19] "positional_accuracy"        "public_positional_accuracy"
[21] "geoprivacy"                 "taxon_geoprivacy"          
[23] "coordinates_obscured"       "scientific_name"           
[25] "common_name"                "iconic_taxon_name"         
[27] "taxon_id"                   "taxon_kingdom_name"        
[29] "taxon_phylum_name"          "taxon_class_name"          
[31] "taxon_order_name"           "taxon_family_name"         
[33] "taxon_genus_name"           "taxon_species_name"        
[35] "taxon_subspecies_name"

We want to save the cleaned up data set so we can use it later. We can save data.frame to a CSV using the write_csv() function from the readr package. The first argument is the name of the data.frame, and the second is the path to the new file we want to create.

write_csv(inat, file= observations_cleaned_path, na = "" )

Adding more information

In order to get information about observations with threatened, introduced, and native species, we had to download a separate CSV for each option. We want to merge all these files into one cleaned observations file.

inat_data <- read_csv(observations_cleaned_path)
threatened_raw <- read_csv(here('data/raw/observations-514065_threatened.csv'))
introduced_raw <- read_csv(here('data/raw/observations-514069_introduced.csv'))
native_raw <- read_csv(here('data/raw/observations-514076_native.csv'))

Get threatened species.

threatened_data <- threatened_raw %>% 
  mutate(threatened = TRUE)

threatened_data <- threatened_data[!duplicated(threatened_data), ]

threatened_data

# A tibble: 337 × 2
   taxon_id threatened
      <dbl> <lgl>     
 1    48662 TRUE      
 2  1454382 TRUE      
 3    59575 TRUE      
 4   210624 TRUE      
 5   221090 TRUE      
 6    80308 TRUE      
 7   144205 TRUE      
 8    78804 TRUE      
 9    61164 TRUE      
10    77658 TRUE      
# ℹ 327 more rows

Get introduced species.

introduced_data <- introduced_raw %>% 
  mutate(establishment_means = 'introduced') 
  
introduced_data <- introduced_data[!duplicated(introduced_data), ]


introduced_data

# A tibble: 1,440 × 2
   taxon_id establishment_means
      <dbl> <chr>              
 1   480298 introduced         
 2    13858 introduced         
 3   187316 introduced         
 4    51271 introduced         
 5    46020 introduced         
 6   236935 introduced         
 7   552449 introduced         
 8    53168 introduced         
 9   162972 introduced         
10    53025 introduced         
# ℹ 1,430 more rows

Get native species.

native_data <- native_raw %>% 
  mutate(establishment_means = 'native') 
  
native_data <- native_data[!duplicated(native_data), ]


native_data

# A tibble: 3,058 × 2
   taxon_id establishment_means
      <dbl> <chr>              
 1    54247 native             
 2    36100 native             
 3    36204 native             
 4     6930 native             
 5     7470 native             
 6    43115 native             
 7    48662 native             
 8     7089 native             
 9    50822 native             
10    29053 native             
# ℹ 3,048 more rows

Get combine introduced and native data frames

establishment_means <- rbind(introduced_data, native_data)

establishment_means

# A tibble: 4,498 × 2
   taxon_id establishment_means
      <dbl> <chr>              
 1   480298 introduced         
 2    13858 introduced         
 3   187316 introduced         
 4    51271 introduced         
 5    46020 introduced         
 6   236935 introduced         
 7   552449 introduced         
 8    53168 introduced         
 9   162972 introduced         
10    53025 introduced         
# ℹ 4,488 more rows

remove rows where taxon_id is listed as both native and introduced

non_duplicates <- establishment_means %>%
  count(taxon_id) %>%
  arrange(desc(n)) %>%
  filter(n == 1)  
  
keep_ids <- non_duplicates$taxon_id

length(keep_ids)

[1] 4436

dedup_establishment_means <- establishment_means %>%
  filter(taxon_id %in% keep_ids)

add threatened info to the main inat data frame.

combined_data <- left_join(inat_data, threatened_data)

Joining with `by = join_by(taxon_id, threatened)`

add native, introduced info to main inat data frame

combined_data <- left_join(combined_data, dedup_establishment_means)

Joining with `by = join_by(taxon_id, establishment_means)`

save file

write_csv(combined_data, observations_cleaned_path, na = "")

LASAN

Indicator species

Convert pdf to CSV

pdf <- pdf_text(here("data/raw/LASAN/cnt061142_indicator_species.pdf")) %>%
  readr::read_lines()

my_list = list()
for (line in pdf[5:189]) {
  if (line != '') {
    matches <- str_match(line, 
        "^([a-zA-Z]+)  +([a-zA-Z]+ [a-zA-Z.()]+ ([a-zA-Z.()]+ [a-zA-Z.()]+)?)  +([-'a-zA-Z ]+)  +"  
    )
    row <- c(matches[1,2], matches[1,3],  matches[1,5], 'species') %>% str_squish()
    my_list[[length(my_list)+1]] <- row
  }
}

stats_df <- plyr::ldply(my_list) 
colnames(stats_df) <- c('group', 'scientific name', 'common name', 'taxon rank')

stats_df <- stats_df %>% filter_all(any_vars(!is.na(.)))
stats_df <- stats_df %>% filter(group != 'Group')

stats_df[2,'common name'] <- paste(stats_df[2,'common name'], 'Salamander')
stats_df[18,'common name'] <- paste(stats_df[18,'common name'], 'Jerusalem crickets')
stats_df[22,'common name'] <- paste(stats_df[22,'common name'], 'hairstreak')
stats_df[23,'common name'] <- paste(stats_df[23,'common name'], 'butterfly')
stats_df[23,'taxon rank'] <- 'subspecies'
stats_df[25,'scientific name'] <- 'Mutillidae'
stats_df[25,'taxon rank'] <- 'family'
stats_df[26,'scientific name'] <- 'Pogonomyrmex'
stats_df[26,'taxon rank'] <- 'genus'

write_csv(stats_df, here('data/cleaned/LA_city_indicator_species.csv'))