Yes - I partition the BigQuery table by day and then delete the duplicates with a Cloud Function, shown below. I use a BigQuery Data Transfer job to copy the AWS billing data from S3 into BigQuery once a day, and the Cloud Function is triggered afterwards to tidy up.
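One note: this relies on the destination table being ingestion-time partitioned by day, so that _PARTITIONTIME is populated for every load. Here's a minimal sketch of that one-off table setup with the Node client, assuming the table is created ahead of time and the transfer job supplies the schema; the dataset and table names are the same placeholders as in the function below.

'use strict';

const {BigQuery} = require('@google-cloud/bigquery');

// One-off setup: create the destination table with ingestion-time
// day partitioning, so each daily transfer run lands in its own
// _PARTITIONTIME partition. Dataset/table names are placeholders.
async function createPartitionedTable() {
  const bigquery = new BigQuery();
  await bigquery
    .dataset('YOUR_AWS_DATASET')
    .createTable('YOUR_AWS_TABLE', {
      timePartitioning: {type: 'DAY'}
    });
  console.log('Created day-partitioned table');
}

createPartitionedTable().catch(console.error);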
// Copyright 2021 Genome Research Limited
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
'use strict';

const {BigQuery} = require('@google-cloud/bigquery');

// Day-partitioned BigQuery table that the Transfer job loads into.
const bq = {
  location: 'EU',
  project: 'YOUR_GOOGLE_PROJECT',
  dataset: 'YOUR_AWS_DATASET',
  table: 'YOUR_AWS_TABLE'
};

async function runQuery() {
  const bigquery = new BigQuery();

  // Find the newest partition, then delete rows for the same billing
  // period that sit in any older partition. Each daily transfer
  // re-delivers the current month's bill, so only the latest copy of
  // that billing period should be kept.
  const query = `
    DECLARE max_date TIMESTAMP;
    DECLARE current_bill_period TIMESTAMP;

    SET max_date = (SELECT MAX(_PARTITIONTIME)
                    FROM \`${bq.project}.${bq.dataset}.${bq.table}\`);

    SET current_bill_period = (SELECT MAX(bill_billing_period_start_date)
                               FROM \`${bq.project}.${bq.dataset}.${bq.table}\`
                               WHERE _PARTITIONTIME = max_date);

    DELETE FROM \`${bq.project}.${bq.dataset}.${bq.table}\`
    WHERE bill_billing_period_start_date = current_bill_period
      AND _PARTITIONTIME < max_date;
  `;

  const options = {
    query: query,
    location: bq.location
  };

  const [job] = await bigquery.createQueryJob(options);
  console.log(`BigQuery delete job ${job.id} started`);

  // A DELETE returns no result rows; this just waits for the job to
  // finish and surfaces any errors.
  const [rows] = await job.getQueryResults();
  console.log('Results:');
  rows.forEach(row => console.log(row));
}

// Pub/Sub-triggered entry point. The BigQuery Data Transfer Service
// publishes a run notification; only act on successful runs into our
// dataset/table. Returning the promise makes Cloud Functions wait for
// the delete to complete instead of terminating it early.
exports.deleteAWSDupBill = (message, context) => {
  const data = JSON.parse(Buffer.from(message.data, 'base64').toString());
  if (data.state === 'SUCCEEDED' &&
      data.destinationDatasetId === bq.dataset &&
      data.params.destination_table_name_template === bq.table) {
    return runQuery();
  }
};
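For anyone wiring this up: the function subscribes to the Pub/Sub topic that the transfer's run notifications are published to, and the guard above only reads three fields of the decoded message. Illustratively, a matching payload looks something like this (real notifications carry more fields; the values here are just the placeholders from above):

// Decoded message.data for a successful transfer run; only the three
// fields that deleteAWSDupBill checks are shown.
const exampleNotification = {
  state: 'SUCCEEDED',
  destinationDatasetId: 'YOUR_AWS_DATASET',
  params: {
    destination_table_name_template: 'YOUR_AWS_TABLE'
  }
};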