In this blog post I will examine how the number of rows packed into a single multi-row INSERT statement impacts timing.
With the PostgreSQL "INSERT multiple rows" approach, a single INSERT statement inserts several rows into a table at once.
The approach has the following syntax (a concrete example follows right after the table definition below):
INSERT INTO table_name (column_list)
VALUES
(value_list_1),
(value_list_2),
...
(value_list_n);
Assume I have a table:
postgres=# create table my_table(a varchar(100), b timestamp, c bigint);
CREATE TABLE
postgres=# \d my_table
Table "public.my_table"
Column | Type | Collation | Nullable | Default
--------+-----------------------------+-----------+----------+---------
a | character varying(100) | | |
b | timestamp without time zone | | |
c | bigint | | |
postgres=#
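For instance, a single statement that inserts three rows into my_table could look like this (the values are just made-up samples):
insert into my_table (a, b, c)
values
  ('foo', timestamp '2021-01-01 10:00:00', 42),
  ('bar', timestamp '2021-02-02 11:30:00', 314),
  ('baz', timestamp '2021-03-03 12:45:00', 2718);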
To populate the table using the INSERT multiple rows approach I will use a PL/pgSQL function.
It takes two input parameters:
- n_of_recs - number of records to insert in a table
- chunk_size - number of rows inserted at once using the INSERT multiple rows approach
create or replace function populate_table(n_of_recs bigint, chunk_size smallint)
returns bigint
language plpgsql
as
$$
declare
  time_ms bigint := 0;
  random_varchar_length smallint;
  random_varchar varchar(100);
  random_timestamp timestamp;
  random_int bigint;
  query text;
  start_ts timestamp;
  end_ts timestamp;
begin
  if chunk_size <= 0 then
    raise exception 'The chunk_size should be positive.';
  end if;
  select clock_timestamp() into start_ts;
  query := 'insert into my_table values ';
  for idx in 1..n_of_recs loop
    -- some random varchar length between 1 and 100
    random_varchar_length := floor(random()*(100-1+1))+1;
    -- some random varchar
    random_varchar := array_to_string(array(select chr((ascii('a') + round(random() * 25)) :: integer) from generate_series(1,random_varchar_length)), '');
    -- some random int between 55 and 777777777
    random_int := floor(random()*(777777777-55+1))+55;
    -- some random timestamp between '1799-01-01 00:00:00' and '2080-01-01 00:00:00'
    random_timestamp := timestamp '1799-01-01 00:00:00' + random() * (timestamp '2080-01-01 00:00:00' - timestamp '1799-01-01 00:00:00');
    if (idx = n_of_recs) or (idx % chunk_size = 0) then
      query := query || format('(''%s'', ''%s'', %s);', random_varchar, random_timestamp, random_int);
      -- raise notice 'Flush. Populated Total: % recs', idx;
      execute query;
      query := 'insert into my_table values ';
    else
      query := query || format('(''%s'', ''%s'', %s), ', random_varchar, random_timestamp, random_int);
    end if;
  end loop;
  select clock_timestamp() into end_ts;
  select
    round((
      (extract(epoch from end_ts) -
       extract(epoch from start_ts))) * 1000)
  into time_ms;
  return time_ms;
end;
$$;
CREATE FUNCTION
postgres=# \df populate_table
List of functions
Schema | Name | Result data type | Argument data types | Type
--------+----------------+------------------+---------------------------------------+------
public | populate_table | bigint | n_of_recs bigint, chunk_size smallint | func
(1 row)
postgres=#
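The random-value expressions inside the function can also be tried on their own. Here is a hedged sketch of a standalone SELECT that reuses the same formulas (the column aliases are mine, added only for illustration):
select
  floor(random()*(100-1+1))+1 as random_length_1_to_100,
  array_to_string(
    array(select chr((ascii('a') + round(random() * 25)) :: integer)
          from generate_series(1, 10)), '') as random_varchar_of_length_10,
  floor(random()*(777777777-55+1))+55 as random_int_55_to_777777777,
  timestamp '1799-01-01 00:00:00'
    + random() * (timestamp '2080-01-01 00:00:00' - timestamp '1799-01-01 00:00:00')
    as random_timestamp;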
Now let's examine how the time it takes to populate the table with the INSERT multiple rows approach depends on the number of rows inserted at once.
postgres=# create table check_the_timing(n_of_recs bigint, chunk_size smallint, timing_ms bigint);
CREATE TABLE
postgres=# \d check_the_timing
Table "public.check_the_timing"
Column | Type | Collation | Nullable | Default
------------+----------+-----------+----------+---------
n_of_recs | bigint | | |
chunk_size | smallint | | |
timing_ms | bigint | | |
Let's check how long it takes to insert 100000 rows into my_table using the INSERT multiple rows approach, with each INSERT flushing 100 rows at once.
postgres=# select populate_table(100000::bigint, 100::smallint)::bigint timing_ms;
timing_ms
-----------
5734
(1 row)
postgres=#
According to the output it takes 5734 milliseconds.
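As a quick sanity check (not part of the original output), the row count can be verified; this assumes nothing else has written to my_table:
select count(*) from my_table;
-- expected: 100000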
Let's examine how long it takes to insert 200000 rows into my_table using the INSERT multiple rows approach.
Each INSERT flushes chunk_size rows at once, where chunk_size iterates from 5000 to 20000 in steps of 2500.
To keep the experiment clean, I truncate my_table before every probe.
do
$do$
declare
  n_chunk_size smallint := 20000;
begin
  for n_rows_in_a_single_insert in 5000..n_chunk_size by 2500 loop
    truncate table my_table;
    with n_of_recs as (select 200000::bigint val),
         chunk_size as (select n_rows_in_a_single_insert val),
         timing_ms as (select populate_table(n_of_recs.val::bigint, chunk_size.val::smallint)::bigint val from n_of_recs, chunk_size)
    insert into check_the_timing
    select n_of_recs.val, chunk_size.val, timing_ms.val
    from n_of_recs, chunk_size, timing_ms;
  end loop;
end;
$do$;
Let's analyze the obtained results:
postgres=# select n_of_recs, chunk_size, timing_ms
from check_the_timing
order by 1, 2;
n_of_recs | chunk_size | timing_ms
-----------+------------+-----------
200000 | 5000 | 17575
200000 | 7500 | 20748
200000 | 10000 | 26458
200000 | 12500 | 29194
200000 | 15000 | 31886
200000 | 17500 | 35876
200000 | 20000 | 44726
(7 rows)
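Before looking at the chart, a derived view can make the trend easier to read. This is just an illustrative sketch (the ms_per_row and rows_per_sec columns are my own names, not part of the original experiment):
select n_of_recs,
       chunk_size,
       timing_ms,
       round(timing_ms::numeric / n_of_recs, 4) as ms_per_row,
       round(n_of_recs * 1000.0 / timing_ms)    as rows_per_sec
from check_the_timing
order by chunk_size;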
Let's present the data as a chart:
Another example: how long does it take to insert 1000000 rows into my_table using the INSERT multiple rows approach?
This time the chunk size goes up to 40000 rows, which no longer fits into a smallint, so check_the_timing and populate_table are recreated with chunk_size declared as bigint.
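I assume the previous smallint versions were dropped before recreating them; a minimal sketch of that cleanup:
drop table if exists check_the_timing;
drop function if exists populate_table(bigint, smallint);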
postgres=# create table check_the_timing(n_of_recs bigint, chunk_size bigint, timing_ms bigint);
CREATE TABLE
postgres=# \d check_the_timing
Table "public.check_the_timing"
Column | Type | Collation | Nullable | Default
------------+--------+-----------+----------+---------
n_of_recs | bigint | | |
chunk_size | bigint | | |
timing_ms | bigint | | |
postgres=#
create or replace function populate_table(n_of_recs bigint, chunk_size bigint)
returns bigint
language plpgsql
as
$$
declare
  time_ms bigint := 0;
  random_varchar_length bigint;
  random_varchar varchar(100);
  random_timestamp timestamp;
  random_int bigint;
  query text;
  start_ts timestamp;
  end_ts timestamp;
begin
  if chunk_size <= 0 then
    raise exception 'The chunk_size should be positive.';
  end if;
  select clock_timestamp() into start_ts;
  query := 'insert into my_table values ';
  for idx in 1..n_of_recs loop
    -- some random varchar length between 1 and 100
    random_varchar_length := floor(random()*(100-1+1))+1;
    -- some random varchar
    random_varchar := array_to_string(array(select chr((ascii('a') + round(random() * 25)) :: integer) from generate_series(1,random_varchar_length)), '');
    -- some random int between 55 and 777777777
    random_int := floor(random()*(777777777-55+1))+55;
    -- some random timestamp between '1799-01-01 00:00:00' and '2080-01-01 00:00:00'
    random_timestamp := timestamp '1799-01-01 00:00:00' + random() * (timestamp '2080-01-01 00:00:00' - timestamp '1799-01-01 00:00:00');
    if (idx = n_of_recs) or (idx % chunk_size = 0) then
      query := query || format('(''%s'', ''%s'', %s);', random_varchar, random_timestamp, random_int);
      raise notice 'Flush. Populated Total: % recs', idx;
      execute query;
      query := 'insert into my_table values ';
    else
      query := query || format('(''%s'', ''%s'', %s), ', random_varchar, random_timestamp, random_int);
    end if;
  end loop;
  select clock_timestamp() into end_ts;
  select
    round((
      (extract(epoch from end_ts) -
       extract(epoch from start_ts))) * 1000)
  into time_ms;
  return time_ms;
end;
$$;
postgres=# \df populate_table
List of functions
Schema | Name | Result data type | Argument data types | Type
--------+----------------+------------------+-------------------------------------+------
public | populate_table | bigint | n_of_recs bigint, chunk_size bigint | func
(1 row)
postgres=#
Each INSERT flushes chunk_size rows at once, where chunk_size iterates from 2500 to 40000 in steps of 5000.
To keep the experiment clean, I truncate my_table before every probe.
do
$do$
begin
  for n_rows_in_a_single_insert in 2500..40000 by 5000 loop
    raise notice 'n_rows_in_a_single_insert: %', n_rows_in_a_single_insert;
    truncate table my_table;
    with n_of_recs as (select 1000000::bigint val),
         chunk_size as (select n_rows_in_a_single_insert val),
         timing_ms as (select populate_table(n_of_recs.val::bigint, chunk_size.val::bigint)::bigint val from n_of_recs, chunk_size)
    insert into check_the_timing
    select n_of_recs.val, chunk_size.val, timing_ms.val
    from n_of_recs, chunk_size, timing_ms;
  end loop;
end;
$do$;
Let's analyze the obtained results:
postgres=# select n_of_recs, chunk_size, timing_ms
from check_the_timing
order by 1, 2;
n_of_recs | chunk_size | timing_ms
-----------+------------+-----------
1000000 | 2500 | 75140
1000000 | 7500 | 105269
1000000 | 12500 | 143569
1000000 | 17500 | 189686
1000000 | 22500 | 235968
1000000 | 27500 | 307523
1000000 | 32500 | 399844
1000000 | 37500 | 503795
(8 rows)
postgres=#
Conclusion:
In this blog post I demonstrated an approach for examining how the number of rows flushed in a single multi-row INSERT impacts timing: in these tests, loading the same total number of rows consistently took longer as the chunk size grew.