Merge pull request #11 from junder873/update-merge-methods

Overhaul Linking Methodology
junder873 · Apr 20, 2023 · 9821849 · 9821849 · junder873 · Apr 20, 2023
2 parents 4087aea + 0210d5b
commit 9821849
Show file tree

Hide file tree

Showing 32 changed files with 4,016 additions and 1,179 deletions.
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -15,7 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.5' # Replace this with the minimum Julia version that your package supports. E.g. if your package requires Julia 1.5 or higher, change this to '1.5'.
+          - '1.6' # Replace this with the minimum Julia version that your package supports. E.g. if your package requires Julia 1.5 or higher, change this to '1.5'.
           - '1' # Leave this line unchanged. '1' will automatically expand to the latest stable 1.x release of Julia.
         os:
           - ubuntu-latest

diff --git a/Project.toml b/Project.toml
@@ -1,35 +1,27 @@
 name = "WRDSMerger"
 uuid = "59d27aa3-834e-4232-9046-52ef43e86786"
 authors = ["junder873 <junder873@gmail.com>"]
-version = "0.4.0"
+version = "0.5.0"
 
 [deps]
-AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
-BusinessDays = "4f18b42c-503e-5345-9536-bb0f25fc7038"
 DBInterface = "a10d1c49-ce27-4219-8d33-6db1a4562965"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LibPQ = "194296ae-ab2e-5f79-8cd4-7183a0a5a0d1"
-LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
-AbstractTrees = "0.3"
-BusinessDays = "0.9"
-DBInterface = "2.4"
-DataFrames = "1.2"
-LibPQ = "1.8"
-ShiftedArrays = "1"
-julia = "1.5"
-StatsBase = "0.33"
+DBInterface = "2.5"
+DataFrames = "1.5"
+LibPQ = "1.15"
+julia = "1.8"
 
 [extras]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 SQLite = "0aa819cd-b072-5ff4-a722-6bc24af294d9"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 
 [targets]
-test = ["Test", "SQLite", "CSV"]
+test = ["Test", "SQLite", "CSV", "Documenter"]
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -1,2 +1,6 @@
 [deps]
-Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
+WRDSMerger = "59d27aa3-834e-4232-9046-52ef43e86786"
diff --git a/docs/make.jl b/docs/make.jl
@@ -1,7 +1,51 @@
 using WRDSMerger
 using Documenter
 
+DocMeta.setdocmeta!(
+    WRDSMerger,
+    :DocTestSetup,
+    quote
+        data_dir = joinpath("..", "test", "data")
+        using CSV, DataFrames, WRDSMerger, Dates
+        files = [
+            "crsp_links",
+            "crsp_comp_links",
+            "gvkey_cik_links",
+            "ibes_links",
+            "option_links",
+            "ravenpack_links"
+        ]
+        funs=[
+            generate_crsp_links,
+            generate_comp_crsp_links,
+            generate_comp_cik_links,
+            generate_ibes_links,
+            generate_option_crsp_links,
+            generate_ravenpack_links
+        ]
+        for (file, fun) in zip(files, funs)
+            fun(
+                DataFrame(
+                    CSV.File(joinpath(data_dir, file * ".csv"))
+                )
+            )
+        end
+    end;
+    recursive=true
+)
+
 Documenter.makedocs(
     modules = [WRDSMerger],
     sitename = "WRDSMerger.jl",
-)
+    pages = [
+        "Introduction" => "index.md",
+        "Downloading WRDS Data" => "download_data.md",
+        "Links Between WRDS Data" => [
+            "Linking Basics" => "basic_linking.md",
+            "Default Behavior" => "default_behavior.md",
+            "Identifier Types" => "linking_identifiers.md",
+            "Internals" => "linking_internals.md"
+        ],
+        "Miscellaneous Utilities" => "misc_utilities.md"
+    ]
+)
diff --git a/docs/src/basic_linking.md b/docs/src/basic_linking.md
@@ -0,0 +1,134 @@
+
+```@setup default_behavior
+data_dir = joinpath("..", "..", "test", "data")
+using CSV, DataFrames, WRDSMerger, Dates
+files = [
+    "crsp_links",
+    "crsp_comp_links",
+    "gvkey_cik_links",
+]
+funs=[
+    generate_crsp_links,
+    generate_comp_crsp_links,
+    generate_comp_cik_links,
+]
+for (file, fun) in zip(files, funs)
+    fun(
+        DataFrame(
+            CSV.File(joinpath(data_dir, file * ".csv"))
+        )
+    )
+end
+create_all_links()
+```
+# Basics of Linking Identifiers
+
+A core part of this package is to provide a simple and consistent interface for linking different identifiers in WRDS. One of the primary goals is to reduce the overhead of remembering how exactly to link one dataset to another.
+
+## Downloading and Saving Data
+
+To do so, first download the necessary data from WRDS. This package provides download functions to do so (see [Linking Download Functions](@ref)), which are automatically called by the respective generating functions (see [Generating LinkPair Functions](@ref)). The generating functions take in a `DataFrame` (which is expected to have certain column names) and create the necessary functions between its identifiers. Finally, calling `create_all_links()` will create the remaining links that the tables do not provide.
+
+To provide an example:
+```julia
+julia> db = ODBC.Connection("wrds-pgdata-64");
+julia> generate_crsp_links(db) # downloads the data, creates links between 
+# Permno <-> Permco, Permno <-> NCusip, etc.
+# and returns the data that is downloaded
+
+julia> generate_comp_crsp_links(db) # similar to generate_crsp_links
+
+julia> create_all_links() # defines functions between NCusip <-> GVKey, 
+# Ticker <-> GVKey, etc.
+```
+
+The generate functions return the DataFrame that is downloaded so you can save it locally (with CSV.jl, Arrow.jl, etc.) and reuse it instead of re-downloading the data.
+
+This package also provides a simple function that runs all of these:
+```julia
+julia> download_all_links(db)
+```
+This downloads all 6 default tables and returns those 6 DataFrames. Note that if your WRDS account lacks access to one of the tables, you need to change which items are downloaded.
+
+For example, the code I use when starting a project is:
+```julia
+data_dir = joinpath(path_to_saved_files)
+dfs = download_all_links(db)
+files = [
+    "crsp_links",
+    "crsp_comp_links",
+    "gvkey_cik_links",
+    "ibes_links",
+    "option_links",
+    "ravenpack_links"
+]
+# I prefer Arrow.jl and feather files, replace with CSV.jl if desired
+for (df, file) in zip(dfs, files)
+    Arrow.write(joinpath(data_dir, file * ".feather"), df)
+end
+```
+
+Then, whenever I reload the project:
+```julia
+funs=[
+    generate_crsp_links,
+    generate_comp_crsp_links,
+    generate_comp_cik_links,
+    generate_ibes_links,
+    generate_option_crsp_links,
+    generate_ravenpack_links
+]
+for (file, f) in zip(files, funs)
+    @chain joinpath(data_dir, file * ".feather") begin
+        Arrow.Table
+        DataFrame
+        copy
+        f
+    end
+end
+create_all_links()
+```
+
+## Linking Identifiers
+
+Once the initial data is downloaded and necessary functions are created, the package provides a consistent set of methods to convert one identifier to any other. This follows the pattern:
+```
+(ID You Want)((ID You Have)(value), Date for conversion)
+```
+For example:
+```@repl default_behavior
+GVKey(Permno(47896), Date(2020))
+NCusip(CIK(19617), Date(2020)) # works for Int or String
+CIK(Permno(47896), Date(2020))
+CIK(NCusip("46625H21"), Date(2020))
+```
+As you can see, this includes cases where there is not a table providing a direct link (CIK <-> Permno, CIK <-> NCusip). This makes it easy to link the varied datasets in WRDS.
+
+These functions can be easily used with broadcasting:
+```@repl default_behavior
+GVKey.(Permno.([47896, 44206, 46703]), Date(2020))
+GVKey.(Permno.([47896, 44206, 46703]), [Date(2018), Date(2019), Date(2020)])
+```
+
+Or with other packages such as [DataFramesMeta.jl](https://juliadata.github.io/DataFramesMeta.jl/stable/):
+```julia
+@chain df begin
+    @rtransform(:gvkey = GVKey(Permno(:permno), :date))
+end
+```
+
+
+All of the identifiers that this package provides by default are seen in [Identifier Types](@ref). This is expandable as discussed in [Adding New Identifiers](@ref).
+
+## Generating LinkPair Functions
+
+This section describes the default functions that exist to generate the necessary links.
+
+```@docs
+generate_crsp_links
+generate_comp_crsp_links
+generate_comp_cik_links
+generate_ibes_links
+generate_option_crsp_links
+generate_ravenpack_links
+```
diff --git a/docs/src/default_behavior.md b/docs/src/default_behavior.md
@@ -0,0 +1,88 @@
+```@setup default_behavior
+data_dir = joinpath("..", "..", "test", "data")
+using CSV, DataFrames, WRDSMerger, Dates
+files = [
+    "crsp_links",
+    "crsp_comp_links",
+    "gvkey_cik_links",
+    "ibes_links",
+    "option_links",
+    "ravenpack_links"
+]
+funs=[
+    generate_crsp_links,
+    generate_comp_crsp_links,
+    generate_comp_cik_links,
+    generate_ibes_links,
+    generate_option_crsp_links,
+    generate_ravenpack_links
+]
+for (file, fun) in zip(files, funs)
+    fun(
+        DataFrame(
+            CSV.File(joinpath(data_dir, file * ".csv"))
+        )
+    )
+end
+```
+
+# Default Behavior
+
+This package has some defaults that are important to be aware of during use.
+
+## Different Return Types
+
+The general design principle in Julia is that calling a type as a function should return an instance of that type. In this package, this is not always the case. When an [`AbstractIdentifier`](@ref) uses an external type (e.g. `Int`), it will return that `AbstractIdentifier`. However, when an `AbstractIdentifier` is used on another `AbstractIdentifier`, it will most often return the underlying value. For example:
+```@repl default_behavior
+Permno(47896) # returns the type Permno
+Permno(Permco(20436), Date(2020)) # an Int type
+```
+
+The reason for this difference is that the `AbstractIdentifier` types are primarily meant for internal use and communicating information to the functions, but it is more often necessary to have the common Julia type for later joins. If the `AbstractIdentifier` itself is needed, then run:
+```@repl default_behavior
+WRDSMerger.convert_identifier(Permno, Permco(20436), Date(2020))
+```
+
+## Default Options in Conversions
+
+### Parent Firms
+
+Certain [`SecurityIdentifier`](@ref)s have a direct link to a parent firm, most obviously [`Cusip`](@ref) and [`NCusip`](@ref) (with [`Cusip6`](@ref) and [`NCusip6`](@ref)). In certain situations, it can make sense to allow a match to occur through these parent firms, such as when the end goal is to match a `SecurityIdentifier` to a [`FirmIdentifier`](@ref).
+
+For example, consider the case of `NCusip("46625H21")`, which is not in the data. Therefore, when trying to convert this to another `SecurityIdentifier`, it will return `missing` since there is not an exact match:
+```@repl default_behavior
+Permno(NCusip("46625H21"), Date(2020))
+```
+However, if trying to match this `NCusip` to a `FirmIdentifier`, then it will return a match:
+```@repl default_behavior
+Permco(NCusip("46625H21"), Date(2020))
+```
+This is because while the `NCusip` is not in the data, the `NCusip6("46625H")` is:
+```@repl default_behavior
+Permco(NCusip6("46625H"), Date(2020))
+```
+The logic here is that it should not matter if a particular security does not match to a firm if the parent firm of that security does match to a firm. This is very useful if the integrity of the `Cusip` values is in question. This behavior can be disabled or enabled by setting `allow_parent_firm`:
+```@repl default_behavior
+Permno(NCusip("46625H21"), Date(2020); allow_parent_firm=true)
+Permco(NCusip("46625H21"), Date(2020); allow_parent_firm=false)
+```
+
+### Outside of Date Ranges and Singular Matches
+
+Many links are supposed to be only valid for a specific date range. For example, linking `NCusip("16161A10")` to `Permno(47896)` is only valid between 1996-04-01 and 2001-01-01. However, this `NCusip` only ever links to that `Permno`, so the default behavior in this package is to provide that match:
+```@repl default_behavior
+Permno(NCusip("16161A10"), Date(2020)) # outside date range
+```
+If the link could provide more than one potential result (e.g., if that `NCusip` also could go to a different `Permno`), then this will return `missing`. The default behavior can be disabled by setting `allow_inexact_date=false`:
+```@repl default_behavior
+Permno(NCusip("16161A10"), Date(2020); allow_inexact_date=false) # outside date range
+```
+
+## Supremacy of Permno
+
+In WRDS, Permnos are one of the easiest items to link. For example, there are easily accessible tables for linking GVKey <-> Permno, IbesTicker <-> Permno, and NCusip <-> Permno. This makes it very useful for most links. Therefore, when this package is determining the best path for linking two identifiers that are not directly linked (e.g., RPEntity <-> GVKey), this package will default to using Permno even if other paths exist of equal length.
+
+For example, by default, this package links RPEntity to NCusip6. NCusip6 has direct links to both Permno and Permco, both of which directly link to GVKey. The default in this package will choose the path that goes through Permno (RPEntity -> NCusip6 -> Permno -> GVKey).
+
+!!! note
+    If there is a shorter path, then it will still choose that (e.g., SecID -> NCusip -> NCusip6 -> RPEntity instead of SecID -> NCusip -> Permno -> NCusip6 -> RPEntity).
diff --git a/docs/src/download_data.md b/docs/src/download_data.md
@@ -0,0 +1,27 @@
+
+# Downloading WRDS Data
+
+
+## Explore WRDS
+```@docs
+list_libraries
+list_tables
+describe_table
+get_table
+raw_sql
+```
+
+
+## Compustat
+```@docs
+comp_data
+```
+
+## CRSP
+```@docs
+crsp_stocknames
+crsp_market
+crsp_data
+crsp_delist
+crsp_adjust
+```