Saturday, February 2, 2019

How to find and delete duplicates in PostgreSQL?

A problem that I encounter very often in practice is  finding duplicates from a table and eliminate them.
Since I found a nice way to eliminate them, I thought that I could share.

-- create table 
create table duplicates_example
(
    duplicates_example_id numeric(10) not null primary key,
    firstname varchar(255),
    lastname varchar(255),
    phone varchar(255),
    email varchar(255)
);

-- create sequence 
CREATE SEQUENCE duplicates_example_seq START 1000000 OWNED BY duplicates_example.duplicates_example_id;

Now, let's populate the table.

insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName1', 'LastName1', '12345', 'test@tes.com'; 
insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName1', 'LastName1', '12345', 'test@tes.com'; 
insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName1', 'LastName2', '12345', 'test@tes.com'; 
insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName2', 'LastName2', '123457', 'test@tes.com'; 
insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName2', 'LastName2', '123457', 'test@tes.com'; 
insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName2', 'LastName2', '123457', 'test@tes.com'; 
insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName3', 'LastName3', '523457', 'test3@tes3.com'; 
insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName3', 'LastName3', '523457', 'test3@tes3.com'; 
insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName3', 'LastName3', '523457', 'test3@tes3.com'; 
insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName3', 'LastName3', '523457', 'test3@tes3.com'; 
insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName3', 'LastName4', '523457', 'test3@tes3.com'; 
insert into duplicates_example select distinct nextval('duplicates_example_seq'), 'FirstName3', 'LastName4', '523457', 'test3@tes3.com';

Query the table: select * from duplicates_example order by lastname;

We get:

Now, let's write a query for finding the duplicates.

SELECT * FROM 
(
  SELECT duplicates_example_id,firstname, lastname, phone, email,
  ROW_NUMBER() OVER(PARTITION BY firstname, lastname, phone, email 
  ORDER BY lastname ) AS Row
  FROM duplicates_example
) dups 
WHERE dups.Row > 1;

And the result is:

So, now we have the duplicates and we can delete them. We shall store the duplicates in a table.

CREATE TABLE dups AS
SELECT * FROM 
(
  SELECT duplicates_example_id,firstname, lastname, phone, email,
  ROW_NUMBER() OVER(PARTITION BY firstname, lastname, phone, email 
  ORDER BY lastname ) AS Row
  FROM duplicates_example
) dups 
WHERE dups.Row > 1;

Now we shall delete the duplicates:

delete from duplicates_example
where exists (select 1 from dups d where d.duplicates_example_id  = duplicates_example.duplicates_example_id);

Let's query the table again: select * from duplicates_example;

And see the results:

And done! We eliminated the duplicates!