Computes the percentage of non-null values for each column across one or more datasets.
Arguments
- ...
One or more data frames, dbplyr::tbl_lazy references, or character table names to profile.
- con
A DBI connection object created by DBI::dbConnect(). Optional when all inputs
are dbplyr::tbl_lazy references.
Value
A tibble::tibble() with one row per column of each input table and the columns
table, column, n_total, n_non_null, and pct_non_null (a percentage on the
0–100 scale).
Examples
df <- data.frame(
unique_id = 1:20,
first_name = c(
'John', 'Jon', 'Jane', 'Jane', 'Bob',
'Bobby', 'Alice', 'Alicia', 'Tom', 'Thomas',
'John', 'Jon', 'Jane', 'Janet', 'Bob',
'Robert', 'Alice', 'Alison', 'Tom', 'Tomas'
),
surname = c(
'Smith', 'Smith', 'Doe', 'Doe', 'Jones',
'Jones', 'Brown', 'Brown', 'White', 'White',
'Smith', 'Smyth', 'Doe', 'Doe', 'Jones',
'Jones', 'Brown', 'Browne', 'White', 'White'
),
dob = c(
'1990-01-01', '1990-01-01', '1985-06-15', '1985-06-15',
'2000-12-01', '2000-12-01', '1975-03-22', '1975-03-22',
'1988-07-04', '1988-07-04', '1990-01-01', '1990-01-02',
'1985-06-15', '1985-06-16', '2000-12-01', '2000-12-02',
'1975-03-22', '1975-03-23', '1988-07-04', '1988-07-05'
),
city = c(
'London', 'London', 'Paris', 'Paris', 'Berlin',
'Berlin', 'Rome', 'Rome', 'Madrid', 'Madrid',
'London', 'London', 'Paris', 'Paris', 'Berlin',
'Berlin', 'Rome', 'Rome', 'Madrid', 'Madrid'
),
email = c(
'john@example.com', 'jon@example.com', 'jane@example.com',
'jane@example.com', 'bob@example.com', 'bobby@example.com',
'alice@example.com', 'alicia@example.com', 'tom@example.com',
'thomas@example.com', 'john@example.com', 'jon@example.com',
'jane@example.com', 'janet@example.com', 'bob@example.com',
'robert@example.com', 'alice@example.com', 'alison@example.com',
'tom@example.com', 'tomas@example.com'
)
)
con <- DBI::dbConnect(duckdb::duckdb())
il_completeness(df, con = con)
#> # A tibble: 6 × 5
#> table column n_total n_non_null pct_non_null
#> <chr> <chr> <int> <int> <dbl>
#> 1 table_1 unique_id 20 20 100
#> 2 table_1 first_name 20 20 100
#> 3 table_1 surname 20 20 100
#> 4 table_1 dob 20 20 100
#> 5 table_1 city 20 20 100
#> 6 table_1 email 20 20 100
DBI::dbDisconnect(con, shutdown = TRUE)
