In this blog post I will examine how the number of rows packed into a single multi-row INSERT statement impacts timing.
With the PostgreSQL "INSERT multiple rows" approach, a single INSERT statement inserts several rows into a table at once.
The approach has the following syntax (a concrete example follows right after the table definition below):
INSERT INTO table_name (column_list)
VALUES
(value_list_1),
(value_list_2),
...
(value_list_n);
Assume I have a table:
postgres=# create table my_table(a varchar(100), b timestamp, c bigint);
CREATE TABLE
postgres=# \d my_table
Table "public.my_table"
Column | Type | Collation | Nullable | Default
--------+-----------------------------+-----------+----------+---------
a | character varying(100) | | |
b | timestamp without time zone | | |
c | bigint | | |
postgres=#
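For instance, a single statement that inserts three rows into my_table could look like this (the values are just made-up samples):
insert into my_table (a, b, c)
values
  ('foo', timestamp '2021-01-01 10:00:00', 42),
  ('bar', timestamp '2021-02-02 11:30:00', 314),
  ('baz', timestamp '2021-03-03 12:45:00', 2718);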
To populate the table using the INSERT multiple rows approach I will use a PL/pgSQL function.
It takes two input parameters:
- n_of_recs - number of records to insert in a table
- chunk_size - number of rows inserted at once using the INSERT multiple rows approach
create or replace function populate_table(n_of_recs bigint, chunk_size smallint)
returns bigint
language plpgsql
as
$$
declare
  time_ms bigint := 0;
  random_varchar_length smallint;
  random_varchar varchar(100);
  random_timestamp timestamp;
  random_int bigint;
  query text;
  start_ts timestamp;
  end_ts timestamp;
begin
  if chunk_size <= 0 then
    raise exception 'The chunk_size should be positive.';
  end if;
  select clock_timestamp() into start_ts;
  query := 'insert into my_table values ';
  for idx in 1..n_of_recs loop
    -- some random varchar length between 1 and 100
    random_varchar_length := floor(random()*(100-1+1))+1;
    -- some random varchar
    random_varchar := array_to_string(array(select chr((ascii('a') + round(random() * 25)) :: integer) from generate_series(1,random_varchar_length)), '');
    -- some random int between 55 and 777777777
    random_int := floor(random()*(777777777-55+1))+55;
    -- some random timestamp between '1799-01-01 00:00:00' and '2080-01-01 00:00:00'
    random_timestamp := timestamp '1799-01-01 00:00:00' + random() * (timestamp '2080-01-01 00:00:00' - timestamp '1799-01-01 00:00:00');
    if (idx = n_of_recs) or (idx % chunk_size = 0) then
      query := query || format('(''%s'', ''%s'', %s);', random_varchar, random_timestamp, random_int);
      -- raise notice 'Flush. Populated Total: % recs', idx;
      execute query;
      query := 'insert into my_table values ';
    else
      query := query || format('(''%s'', ''%s'', %s), ', random_varchar, random_timestamp, random_int);
    end if;
  end loop;
  select clock_timestamp() into end_ts;
  select
    round((
      (extract(epoch from end_ts) -
       extract(epoch from start_ts))) * 1000)
  into time_ms;
  return time_ms;
end;
$$;
CREATE FUNCTION
postgres=# \df populate_table
List of functions
Schema | Name | Result data type | Argument data types | Type
--------+----------------+------------------+---------------------------------------+------
public | populate_table | bigint | n_of_recs bigint, chunk_size smallint | func
(1 row)
postgres=#
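The random-value expressions inside the function can also be tried on their own. Here is a hedged sketch of a standalone SELECT that reuses the same formulas (the column aliases are mine, added only for illustration):
select
  floor(random()*(100-1+1))+1 as random_length_1_to_100,
  array_to_string(
    array(select chr((ascii('a') + round(random() * 25)) :: integer)
          from generate_series(1, 10)), '') as random_varchar_of_length_10,
  floor(random()*(777777777-55+1))+55 as random_int_55_to_777777777,
  timestamp '1799-01-01 00:00:00'
    + random() * (timestamp '2080-01-01 00:00:00' - timestamp '1799-01-01 00:00:00')
    as random_timestamp;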
Now let's examine how the time it takes to populate the table with the INSERT multiple rows approach depends on the number of rows inserted at once.
postgres=# create table check_the_timing(n_of_recs bigint, chunk_size smallint, timing_ms bigint);
CREATE TABLE
postgres=# \d check_the_timing
Table "public.check_the_timing"
Column | Type | Collation | Nullable | Default
------------+----------+-----------+----------+---------
n_of_recs | bigint | | |
chunk_size | smallint | | |
timing_ms | bigint | | |
Let's check how long it takes to insert 100000 rows into my_table using the INSERT multiple rows approach, with each INSERT flushing 100 rows at once.
postgres=# select populate_table(100000::bigint, 100::smallint)::bigint timing_ms;
timing_ms
-----------
5734
(1 row)
postgres=#
According to the output it takes 5734 milliseconds.
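As a quick sanity check (not part of the original output), the row count can be verified; this assumes nothing else has written to my_table:
select count(*) from my_table;
-- expected: 100000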
Let's examine how long it takes to insert 200000 rows into my_table using the INSERT multiple rows approach.
Each INSERT flushes chunk_size rows at once, where chunk_size iterates from 5000 to 20000 in steps of 2500.
To keep the experiment clean, I truncate my_table before every probe.
do
$do$
declare
  n_chunk_size smallint := 20000;
begin
  for n_rows_in_a_single_insert in 5000..n_chunk_size by 2500 loop
    truncate table my_table;
    with n_of_recs as (select 200000::bigint val),
         chunk_size as (select n_rows_in_a_single_insert val),
         timing_ms as (select populate_table(n_of_recs.val::bigint, chunk_size.val::smallint)::bigint val from n_of_recs, chunk_size)
    insert into check_the_timing
    select n_of_recs.val, chunk_size.val, timing_ms.val
    from n_of_recs, chunk_size, timing_ms;
  end loop;
end;
$do$;
Let's analyze the obtained results:
postgres=# select n_of_recs, chunk_size, timing_ms
from check_the_timing
order by 1, 2;
n_of_recs | chunk_size | timing_ms
-----------+------------+-----------
200000 | 5000 | 17575
200000 | 7500 | 20748
200000 | 10000 | 26458
200000 | 12500 | 29194
200000 | 15000 | 31886
200000 | 17500 | 35876
200000 | 20000 | 44726
(7 rows)
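Before looking at the chart, a derived view can make the trend easier to read. This is just an illustrative sketch (the ms_per_row and rows_per_sec columns are my own names, not part of the original experiment):
select n_of_recs,
       chunk_size,
       timing_ms,
       round(timing_ms::numeric / n_of_recs, 4) as ms_per_row,
       round(n_of_recs * 1000.0 / timing_ms)    as rows_per_sec
from check_the_timing
order by chunk_size;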
Let's present the data as a chart:
Another example: how long does it take to insert 1000000 rows into my_table using the INSERT multiple rows approach?
This time the chunk size goes up to 40000 rows, which no longer fits into a smallint, so check_the_timing and populate_table are recreated with chunk_size declared as bigint.
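I assume the previous smallint versions were dropped before recreating them; a minimal sketch of that cleanup:
drop table if exists check_the_timing;
drop function if exists populate_table(bigint, smallint);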
postgres=# create table check_the_timing(n_of_recs bigint, chunk_size bigint, timing_ms bigint);
CREATE TABLE
postgres=# \d check_the_timing
Table "public.check_the_timing"
Column | Type | Collation | Nullable | Default
------------+--------+-----------+----------+---------
n_of_recs | bigint | | |
chunk_size | bigint | | |
timing_ms | bigint | | |
postgres=#
create or replace function populate_table(n_of_recs bigint, chunk_size bigint)
returns bigint
language plpgsql
as
$$
declare
  time_ms bigint := 0;
  random_varchar_length bigint;
  random_varchar varchar(100);
  random_timestamp timestamp;
  random_int bigint;
  query text;
  start_ts timestamp;
  end_ts timestamp;
begin
  if chunk_size <= 0 then
    raise exception 'The chunk_size should be positive.';
  end if;
  select clock_timestamp() into start_ts;
  query := 'insert into my_table values ';
  for idx in 1..n_of_recs loop
    -- some random varchar length between 1 and 100
    random_varchar_length := floor(random()*(100-1+1))+1;
    -- some random varchar
    random_varchar := array_to_string(array(select chr((ascii('a') + round(random() * 25)) :: integer) from generate_series(1,random_varchar_length)), '');
    -- some random int between 55 and 777777777
    random_int := floor(random()*(777777777-55+1))+55;
    -- some random timestamp between '1799-01-01 00:00:00' and '2080-01-01 00:00:00'
    random_timestamp := timestamp '1799-01-01 00:00:00' + random() * (timestamp '2080-01-01 00:00:00' - timestamp '1799-01-01 00:00:00');
    if (idx = n_of_recs) or (idx % chunk_size = 0) then
      query := query || format('(''%s'', ''%s'', %s);', random_varchar, random_timestamp, random_int);
      raise notice 'Flush. Populated Total: % recs', idx;
      execute query;
      query := 'insert into my_table values ';
    else
      query := query || format('(''%s'', ''%s'', %s), ', random_varchar, random_timestamp, random_int);
    end if;
  end loop;
  select clock_timestamp() into end_ts;
  select
    round((
      (extract(epoch from end_ts) -
       extract(epoch from start_ts))) * 1000)
  into time_ms;
  return time_ms;
end;
$$;
postgres=# \df populate_table
List of functions
Schema | Name | Result data type | Argument data types | Type
--------+----------------+------------------+-------------------------------------+------
public | populate_table | bigint | n_of_recs bigint, chunk_size bigint | func
(1 row)
postgres=#
Each INSERT flushes chunk_size rows at once, where chunk_size iterates from 2500 to 40000 in steps of 5000.
To keep the experiment clean, I truncate my_table before every probe.
do
$do$
begin
  for n_rows_in_a_single_insert in 2500..40000 by 5000 loop
    raise notice 'n_rows_in_a_single_insert: %', n_rows_in_a_single_insert;
    truncate table my_table;
    with n_of_recs as (select 1000000::bigint val),
         chunk_size as (select n_rows_in_a_single_insert val),
         timing_ms as (select populate_table(n_of_recs.val::bigint, chunk_size.val::bigint)::bigint val from n_of_recs, chunk_size)
    insert into check_the_timing
    select n_of_recs.val, chunk_size.val, timing_ms.val
    from n_of_recs, chunk_size, timing_ms;
  end loop;
end;
$do$;
Let's analyze the obtained results:
postgres=# select n_of_recs, chunk_size, timing_ms
from check_the_timing
order by 1, 2;
n_of_recs | chunk_size | timing_ms
-----------+------------+-----------
1000000 | 2500 | 75140
1000000 | 7500 | 105269
1000000 | 12500 | 143569
1000000 | 17500 | 189686
1000000 | 22500 | 235968
1000000 | 27500 | 307523
1000000 | 32500 | 399844
1000000 | 37500 | 503795
(8 rows)
postgres=#
Conclusion:
In this blog post I demonstrated an approach for examining how the number of rows flushed in a single multi-row INSERT impacts timing: in these tests, loading the same total number of rows consistently took longer as the chunk size grew.