CREATE EXTENSION fuzzystrmatch;
create table x_names (
id SERIAL NOT NULL PRIMARY KEY,
first_name text,
last_name text,
gender varchar(1),
country varchar(2),
attrs jsonb
);
create table x_names_attrs (
id SERIAL NOT NULL PRIMARY KEY,
name_id integer NOT NULL,
key text,
value text
);
ALTER TABLE x_names_attrs ADD CONSTRAINT x_names_attrs_name_id_fk
FOREIGN KEY (name_id) REFERENCES x_names (id)
ON DELETE CASCADE;
CREATE INDEX x_names_attrs_name_id_idx ON x_names_attrs (name_id);
-- alternative to pure FK index, does not seem to help with the queries we need
CREATE INDEX x_names_attrs_name_id_key_idx ON x_names_attrs (name_id, key);
-- copy of names from CSV file, in the end 10M names were used for experiments
COPY x_names(first_name, last_name, gender, country)
FROM '/vagrant/us.csv'
DELIMITER ','
;
-- lateral join, seems fine when we need l and m in select too, but is slower than the next select
SELECT count(*)
from x_names,
levenshtein_less_equal(first_name, 'Brad', 5) l,
levenshtein_less_equal(last_name, 'Brad', 5) m
where l <= 5 or m <= 5
;
-- fastest variant, using columns
SELECT count(*)
from x_names
where levenshtein_less_equal(first_name, 'Brad', 5) <= 5
or levenshtein_less_equal(last_name, 'Brad', 5) <= 5
-- filling the JSONB column, choose small or larger JSON content, it changes the speeds of full-scan
update x_names
-- small JSONB
set attrs = ('{"first_name": "' || first_name || '", "last_name": "' || last_name || '", "names": ["' || first_name || '", "' || last_name || '"]}')::jsonb
-- bit bigger JSONB
-- set attrs = ('{"first_name": "' || first_name || '", "last_name": "' || last_name || '", "names": ["' || first_name || '", "' || last_name || '"],' ||
-- ' "xxx1": "yyy1", "xxx2": "yyy2", "xxx3": "yyy3", "xxx4": "yyy4", "xxx5": "yyy5", "xxx6": "yyy7", "xxx7": "yyy7", "xxx8": "yyy8",' ||
-- ' "xxx11": "yyy11", "xxx12": "yyy12", "xxx13": "yyy13", "xxx14": "yyy14", "xxx15": "yyy15", "xxx16": "yyy16", "xxx17": "yyy17", "xxx18": "yyy18",' ||
-- ' "xxx21": "yyy21", "xxx22": "yyy22", "xxx23": "yyy23", "xxx24": "yyy24", "xxx25": "yyy25", "xxx26": "yyy26", "xxx27": "yyy27", "xxx28": "yyy28"}')::jsonb
;
-- Showing matching names (from first 50), unrolling the names multi-val JSON value.
-- This, again, uses lateral join, which is not best for WHERE, see the query later.
select *
from x_names,
jsonb_array_elements_text(attrs -> 'names') n,
levenshtein_less_equal(n, 'Brad', 5) l
where id <= 50
and l < 5
;
-- Using single value from JSONB (similar to single column, just marginally slower).
SELECT count(*)
from x_names
where levenshtein_less_equal(attrs ->> 'first_name', 'Brad', 5) <= 5;
-- This checks any of the multi-value names:
SELECT count(*)
from x_names
where exists (select 1 from jsonb_array_elements_text(attrs -> 'names') n
where levenshtein_less_equal(n, 'Brad', 5) <= 5)
-- Now filling the detail table:
insert into x_names_attrs (name_id, key, value)
select id, key, coalesce(aval, sval) val from (
select id, key,
case when jsonb_typeof(value) = 'array' then value end avals,
case when jsonb_typeof(value) <> 'array' then value end sval
from x_names, jsonb_each(attrs)
where id > 1000000
) x left join jsonb_array_elements(avals) aval on true;
-- query using detail table and two "columns"
SELECT count(*)
from x_names n
where exists(select 1 from x_names_attrs
where name_id = n.id
and (key = 'first_name' and levenshtein_less_equal(value, 'Brad', 5) <= 5
or key = 'last_name' and levenshtein_less_equal(value, 'Brad', 5) <= 5));
-- detail table, query for multi-val names key
SELECT count(*)
from x_names
where exists(select 1 from x_names_attrs
where name_id = n.id
and key = 'names'
and levenshtein_less_equal(value, 'Brad', 5) <= 5);
-- For other operations like % and LIKE/ILIKE we can build trigram indexes:
-- One for specific key in JSON (all values must be single-value texts):
create index x_names_afn_trg_idx
on x_names using gin((attrs ->> 'first_name') gin_trgm_ops);
-- Tried this to make the query for 'names' in detail table faster, but did not work.
create index x_names_attrs_names_trg_idx
on x_names_attrs using gin(value gin_trgm_ops)
where key = 'names';
-- Very fast query using JSONB and trigram index (note % operation, levenshtein does not use index):
SELECT count(*)
from x_names n
where (attrs ->> 'first_name') % 'brady';