Yes - I partition the BigQuery table by day and then delete the duplicates with a Cloud Function, shown below. I use a BigQuery Data Transfer job to copy the AWS billing data from S3 into BigQuery once a day, and the Cloud Function is triggered afterwards to tidy up.
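One note: this relies on the destination table being ingestion-time partitioned by day, so that _PARTITIONTIME is populated for every load. Here's a minimal sketch of that one-off table setup with the Node client, assuming the table is created ahead of time and the transfer job supplies the schema; the dataset and table names are the same placeholders as in the function below.

'use strict';

const {BigQuery} = require('@google-cloud/bigquery');

// One-off setup: create the destination table with ingestion-time
// day partitioning, so each daily transfer run lands in its own
// _PARTITIONTIME partition. Dataset/table names are placeholders.
async function createPartitionedTable() {
  const bigquery = new BigQuery();
  await bigquery
    .dataset('YOUR_AWS_DATASET')
    .createTable('YOUR_AWS_TABLE', {
      timePartitioning: {type: 'DAY'}
    });
  console.log('Created day-partitioned table');
}

createPartitionedTable().catch(console.error);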
// Copyright 2021 Genome Research Limited
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
'use strict';

const {BigQuery} = require('@google-cloud/bigquery');

// Day-partitioned BigQuery table that the Transfer job loads into.
const bq = {
  location: 'EU',
  project: 'YOUR_GOOGLE_PROJECT',
  dataset: 'YOUR_AWS_DATASET',
  table: 'YOUR_AWS_TABLE'
};

async function runQuery() {
  const bigquery = new BigQuery();

  // Find the newest partition, then delete rows for the same billing
  // period that sit in any older partition. Each daily transfer
  // re-delivers the current month's bill, so only the latest copy of
  // that billing period should be kept.
  const query = `
    DECLARE max_date TIMESTAMP;
    DECLARE current_bill_period TIMESTAMP;

    SET max_date = (SELECT MAX(_PARTITIONTIME)
                    FROM \`${bq.project}.${bq.dataset}.${bq.table}\`);

    SET current_bill_period = (SELECT MAX(bill_billing_period_start_date)
                               FROM \`${bq.project}.${bq.dataset}.${bq.table}\`
                               WHERE _PARTITIONTIME = max_date);

    DELETE FROM \`${bq.project}.${bq.dataset}.${bq.table}\`
    WHERE bill_billing_period_start_date = current_bill_period
      AND _PARTITIONTIME < max_date;
  `;

  const options = {
    query: query,
    location: bq.location
  };

  const [job] = await bigquery.createQueryJob(options);
  console.log(`BigQuery delete job ${job.id} started`);

  // A DELETE returns no result rows; this just waits for the job to
  // finish and surfaces any errors.
  const [rows] = await job.getQueryResults();
  console.log('Results:');
  rows.forEach(row => console.log(row));
}

// Pub/Sub-triggered entry point. The BigQuery Data Transfer Service
// publishes a run notification; only act on successful runs into our
// dataset/table. Returning the promise makes Cloud Functions wait for
// the delete to complete instead of terminating it early.
exports.deleteAWSDupBill = (message, context) => {
  const data = JSON.parse(Buffer.from(message.data, 'base64').toString());
  if (data.state === 'SUCCEEDED' &&
      data.destinationDatasetId === bq.dataset &&
      data.params.destination_table_name_template === bq.table) {
    return runQuery();
  }
};
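For anyone wiring this up: the function subscribes to the Pub/Sub topic that the transfer's run notifications are published to, and the guard above only reads three fields of the decoded message. Illustratively, a matching payload looks something like this (real notifications carry more fields; the values here are just the placeholders from above):

// Decoded message.data for a successful transfer run; only the three
// fields that deleteAWSDupBill checks are shown.
const exampleNotification = {
  state: 'SUCCEEDED',
  destinationDatasetId: 'YOUR_AWS_DATASET',
  params: {
    destination_table_name_template: 'YOUR_AWS_TABLE'
  }
};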