Below is what we have done to clean original data in R:
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.3 √ purrr 0.3.4
## √ tibble 3.0.5 √ dplyr 1.0.3
## √ tidyr 1.1.2 √ stringr 1.4.0
## √ readr 1.4.0 √ forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
table_1_edit <-
read_excel(here::here("dataset/table-1_edit.xls"), range = "A4:V24") #("Specify cells for reading")
table_1_edit[2, 1] = '2001' # ("Extract data frame cell value")
table_1_edit[19, 1] = '2018' # ("Extract data frame cell value")
names(table_1_edit)[c(2,3,7,8,9,10)] <-
(str_sub(names(table_1_edit)[c(2,3,7,8,9,10)], 1, nchar(names(table_1_edit)[c(2,3,7,8,9,10)])-1)) # ("Rename Data Frame Columns in R") (“Count the Number of Characters (or Bytes or Width)")
names(table_1_edit)[1:22] <-
c("year", "population", "violent_crime", "violent_crime_rate", "murder_and_nonnegligent_manslaughter", "murder_and_nonnegligent_manslaughter_rate", "rape_revised_definition", "rape_revised_definition_rate", "rape_legacy_definition", "rape_legacy_definition_rate", "robbery", "robbery_rate", "aggravated_assault", "aggravated_assault_rate", "property_crime", "property_crime_rate", "burglary", "burglary_rate", "larceny_theft", "larceny_theft_rate", "motor_vehicle_theft", "motor_vehicle_theft_rate")
(table_1_edit)
## # A tibble: 20 x 22
## year population violent_crime violent_crime_r~ murder_and_nonn~
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2000 281421906 1425486 506. 15586
## 2 2001 285317559 1439480 504. 16037
## 3 2002 287973924 1423677 494. 16229
## 4 2003 290788976 1383676 476. 16528
## 5 2004 293656842 1360088 463. 16148
## 6 2005 296507061 1390745 469 16740
## 7 2006 299398484 1435123 479. 17309
## 8 2007 301621157 1422970 472. 17128
## 9 2008 304059724 1394461 459. 16465
## 10 2009 307006550 1325896 432. 15399
## 11 2010 309330219 1251248 404. 14722
## 12 2011 311587816 1206005 387. 14661
## 13 2012 313873685 1217057 388. 14856
## 14 2013 316497531 1168298 369. 14319
## 15 2014 318907401 1153022 362. 14164
## 16 2015 320896618 1199310 374. 15883
## 17 2016 323405935 1250162 387. 17413
## 18 2017 325147121 1247917 384. 17294
## 19 2018 326687501 1209997 370. 16374
## 20 2019 328239523 1203808 367. 16425
## # ... with 17 more variables: murder_and_nonnegligent_manslaughter_rate <dbl>,
## # rape_revised_definition <dbl>, rape_revised_definition_rate <dbl>,
## # rape_legacy_definition <dbl>, rape_legacy_definition_rate <dbl>,
## # robbery <dbl>, robbery_rate <dbl>, aggravated_assault <dbl>,
## # aggravated_assault_rate <dbl>, property_crime <dbl>,
## # property_crime_rate <dbl>, burglary <dbl>, burglary_rate <dbl>,
## # larceny_theft <dbl>, larceny_theft_rate <dbl>, motor_vehicle_theft <dbl>,
## # motor_vehicle_theft_rate <dbl>
unemployment_rate_edit <-
read_excel(here::here("dataset/unemployment_rate_edit.xlsx"),
range = "A12:M32")
(unemployment_rate_edit)
## # A tibble: 20 x 13
## Year Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2000 4 4.1 4 3.8 4 4 4 4.1 3.9 3.9 3.9 3.9
## 2 2001 4.2 4.2 4.3 4.4 4.3 4.5 4.6 4.9 5 5.3 5.5 5.7
## 3 2002 5.7 5.7 5.7 5.9 5.8 5.8 5.8 5.7 5.7 5.7 5.9 6
## 4 2003 5.8 5.9 5.9 6 6.1 6.3 6.2 6.1 6.1 6 5.8 5.7
## 5 2004 5.7 5.6 5.8 5.6 5.6 5.6 5.5 5.4 5.4 5.5 5.4 5.4
## 6 2005 5.3 5.4 5.2 5.2 5.1 5 5 4.9 5 5 5 4.9
## 7 2006 4.7 4.8 4.7 4.7 4.6 4.6 4.7 4.7 4.5 4.4 4.5 4.4
## 8 2007 4.6 4.5 4.4 4.5 4.4 4.6 4.7 4.6 4.7 4.7 4.7 5
## 9 2008 5 4.9 5.1 5 5.4 5.6 5.8 6.1 6.1 6.5 6.8 7.3
## 10 2009 7.8 8.3 8.7 9 9.4 9.5 9.5 9.6 9.8 10 9.9 9.9
## 11 2010 9.8 9.8 9.9 9.9 9.6 9.4 9.4 9.5 9.5 9.4 9.8 9.3
## 12 2011 9.1 9 9 9.1 9 9.1 9 9 9 8.8 8.6 8.5
## 13 2012 8.3 8.3 8.2 8.2 8.2 8.2 8.2 8.1 7.8 7.8 7.7 7.9
## 14 2013 8 7.7 7.5 7.6 7.5 7.5 7.3 7.2 7.2 7.2 6.9 6.7
## 15 2014 6.6 6.7 6.7 6.2 6.3 6.1 6.2 6.1 5.9 5.7 5.8 5.6
## 16 2015 5.7 5.5 5.4 5.4 5.6 5.3 5.2 5.1 5 5 5.1 5
## 17 2016 4.8 4.9 5 5.1 4.8 4.9 4.8 4.9 5 4.9 4.7 4.7
## 18 2017 4.7 4.6 4.4 4.5 4.4 4.3 4.3 4.4 4.2 4.1 4.2 4.1
## 19 2018 4 4.1 4 4 3.8 4 3.8 3.8 3.7 3.8 3.8 3.9
## 20 2019 4 3.8 3.8 3.7 3.7 3.6 3.6 3.7 3.5 3.6 3.6 3.6
GDP_current_dollars_edit <-
read_excel(here::here("dataset/current_dollars GDP_edit.xls"),
range = "A6:V7")
(GDP_current_dollars_edit)
## # A tibble: 1 x 22
## GeoFips GeoName `2000` `2001` `2002` `2003` `2004` `2005` `2006` `2007` `2008`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 00000 United~ 1.03e7 1.06e7 1.09e7 1.15e7 1.22e7 1.30e7 1.38e7 1.45e7 1.47e7
## # ... with 11 more variables: `2009` <dbl>, `2010` <dbl>, `2011` <dbl>,
## # `2012` <dbl>, `2013` <dbl>, `2014` <dbl>, `2015` <dbl>, `2016` <dbl>,
## # `2017` <dbl>, `2018` <dbl>, `2019` <dbl>
GDP_per_capita_states_edit <-
read_excel(here::here("dataset/GDP_per_capita_by_states_edit.xls"),
range = "A6:V66")
(GDP_per_capita_states_edit)
## # A tibble: 60 x 22
## GeoFips GeoName `2000` `2001` `2002` `2003` `2004` `2005` `2006` `2007`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 00000 United~ 30657 31589 31832 32681 34251 35849 38114 39844
## 2 01000 Alabama 24338 25104 25712 26693 28415 29832 31362 32598
## 3 02000 Alaska~ 31974 33517 34634 35841 36916 38919 41058 43861
## 4 04000 Arizona 26235 26803 27149 28161 30012 32220 34686 35789
## 5 05000 Arkans~ 22762 23840 24266 25524 26924 28047 29438 31070
## 6 06000 Califo~ 33364 34043 34214 35415 37369 39279 42088 43629
## 7 08000 Colora~ 34187 35023 34608 34935 35870 37841 40140 42024
## 8 09000 Connec~ 43102 45223 44914 45254 47557 50035 53945 57861
## 9 10000 Delawa~ 33907 36952 37693 38013 38970 39405 41041 41687
## 10 11000 Distri~ 43344 44695 45136 46047 50201 53186 56075 60028
## # ... with 50 more rows, and 12 more variables: `2008` <dbl>, `2009` <dbl>,
## # `2010` <dbl>, `2011` <dbl>, `2012` <dbl>, `2013` <dbl>, `2014` <dbl>,
## # `2015` <dbl>, `2016` <dbl>, `2017` <dbl>, `2018` <dbl>, `2019` <dbl>
real_household_median_income_edit <-
read_excel(here::here("dataset/median_income_edit.xls"),
range = "A28:B47",
col_names = FALSE)
## New names:
## * `` -> ...1
## * `` -> ...2
names(real_household_median_income_edit)[1:2] <-
c("year", "median_income")
real_household_median_income_edit <-
real_household_median_income_edit %>%
mutate(year = format(real_household_median_income_edit$year, format="%Y"))
(real_household_median_income_edit)
## # A tibble: 20 x 2
## year median_income
## <chr> <dbl>
## 1 2000 62512
## 2 2001 61126
## 3 2002 60435
## 4 2003 60360
## 5 2004 60150
## 6 2005 60794
## 7 2006 61268
## 8 2007 62090
## 9 2008 59877
## 10 2009 59458
## 11 2010 57904
## 12 2011 57021
## 13 2012 56912
## 14 2013 58904
## 15 2014 58001
## 16 2015 60987
## 17 2016 62898
## 18 2017 63761
## 19 2018 64324
## 20 2019 68703
GDP_real_term_edit <-
read_excel(here::here("dataset/real_GDP_chained_2012_edit.xls"),
range = "A6:V7")
(GDP_real_term_edit)
## # A tibble: 1 x 22
## GeoFips GeoName `2000` `2001` `2002` `2003` `2004` `2005` `2006` `2007` `2008`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 00000 United~ 1.31e7 1.33e7 1.35e7 1.39e7 1.44e7 1.49e7 1.53e7 1.56e7 1.56e7
## # ... with 11 more variables: `2009` <dbl>, `2010` <dbl>, `2011` <dbl>,
## # `2012` <dbl>, `2013` <dbl>, `2014` <dbl>, `2015` <dbl>, `2016` <dbl>,
## # `2017` <dbl>, `2018` <dbl>, `2019` <dbl>
Gini_coefficient_edit <-
read_excel(here::here("dataset/Gini_coefficient_edit.xlsx"),
range = "A1:X2")
names(Gini_coefficient_edit)[5:24] <-
(str_sub(names(Gini_coefficient_edit)[5:24], 1, 4))
Gini_coefficient_edit[1,24] = NA
(Gini_coefficient_edit)
## # A tibble: 1 x 24
## `Series Name` `Series Code` `Country Name` `Country Code` `2000` `2001` `2002`
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Gini index (~ SI.POV.GINI United States USA 40.1 40.6 40.4
## # ... with 17 more variables: `2003` <dbl>, `2004` <dbl>, `2005` <dbl>,
## # `2006` <dbl>, `2007` <dbl>, `2008` <dbl>, `2009` <dbl>, `2010` <dbl>,
## # `2011` <dbl>, `2012` <dbl>, `2013` <dbl>, `2014` <dbl>, `2015` <dbl>,
## # `2016` <dbl>, `2017` <dbl>, `2018` <dbl>, `2019` <chr>
Reference:
“Count the Number of Characters (or Bytes or Width).” https://stat.ethz.ch/R-manual/R-devel/library/base/html/nchar.html. Accessed 09 April 2021.
“Extract data frame cell value.” datacamp, https://campus.datacamp.com/courses/model-a-quantitative-trading-strategy-in-r/chapter-1-introduction-to-r-for-trading?ex=4. Accessed 09 April 2021.
MacQueen, Don. “[R] Extract year from date.” 09 March 2015, https://stat.ethz.ch/pipermail/r-help/2015-March/426643.html. Accessed 08 April 2021.
“Rename Data Frame Columns in R.” Datanovia, https://www.datanovia.com/en/lessons/rename-data-frame-columns-in-r/. Accessed 08 April 2021.
“Specify cells for reading.” readxl, https://readxl.tidyverse.org/reference/cell-specification.html. Accessed 08 April 2021.
Below is the post for this week:
For the last several weeks, we have already combined several datasets. Initially, we used the table of violent crime rates and property crime rates in 100000 people (table name in the dataset folder is: “table-1”), and we combined the table with other relevant datasets including GDP in real term in millions (table name in the dataset folder is: “real_GDP_chained_2012”), GDP in current dollars in millions (table name in the dataset folder is: “current_dollars GDP”), monthly unemployment rate (table name in the dataset folder is: “unemployment_rate”), Gini coefficient (table name in the dataset folder is: “Gini_coefficient”), real median income (table name in the dataset folder is: “median_income”), and personal income per capita in current dollars by states (table name in the dataset folder is: “GDP_per_capita_by_states”). All of the datasets have the variable year from 2000 to 2019, except that the Gini coefficient dataset only has the data from 2000 to 2018, which allows us we to join them together.
To combine them, we mostly did left join by the variable “year”. From researching those datasets, we found that both violent and property crime rates show trends of decreasing in general as year increases. We also found potential reasons of the decreasing trends. We did linear models, from which we found that as GDP increases (both in real term and in current dollars), crime rates (both violent crime rates and property crime rates) decrease. In addition, real median income can also be one reason, as we found that as real median income increases, both violent crime rates and property crime rates decrease. Moreover, we found that Gini coefficient and GDP (either in real term or in current dollars) can predict crime rates better than Gini coefficient itself. However, it is hard to find a good linear model with unemployment rate as one of the predictors.
When it comes to the difficulty to combine the data, since the datasets all have the variable year, so it was not so hard to do left join. One thing we encountered was that the types of the variable “year” are different sometimes (for example, double for one dataset and character for others). But it could be solved by making the type same first.
So far, we have analyzed the crime rates in a macro scale. We explored the trends of violent and property crime rates from 2000 to 2019 and basically researched on what macro factors can impact the crime rates in the US. For future research, we plan to find more data about weapon holding and explore the relationships between crime rates and weapon holding, and maybe incorporate it into the interactive dashboard.
Reference:
“Databases, Tables & Calculators by Subject.” U.S. Bureau of Labor Statistics, https://data.bls.gov/timeseries/LNS14000000?years_option=all_years. Accessed 19 March 2021.
“Real Median Household Income in the United States.” FRED, https://fred.stlouisfed.org/series/MEHOINUSA672N. Accessed 01 April 2021.
SAGDP2N Gross domestic product (GDP) by state 1/Gross domestic product (GDP) by state: All industry total (Millions of current dollars)." Bureau of Economic Analysis, https://apps.bea.gov/itable/iTable.cfm?ReqID=70&step=1. Accessed 01 April 2021.
“SAGDP9N Real GDP by state 1/Real GDP by state: All industry total (Millions of chained 2012 dollars).” Bureau of Economic Analysis, https://apps.bea.gov/itable/iTable.cfm?ReqID=70&step=1. Accessed 01 April 2021.
“SAINC1 Personal Income Summary: Personal Income, Population, Per Capita Personal Income.” Bureau of Economic Analysis, https://apps.bea.gov/itable/iTable.cfm?ReqID=70&step=1. Accessed 26 March 2021.
“Table 1.” FBI:UCR, https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/topic-pages/tables/table-1. Accessed 19 March 2021.
“World Development Indicators.” THE WORLD BANK, https://databank.worldbank.org/reports.aspx?source=2&series=SI.POV.GINI&country=USA. Accessed 01 April 2021.