Introduction
In this blog post, we'll walk through how to perform a Google search and extract structured data from the results using the Google Gemini Pro large language model.
Hands-on
- Head over to Google Colab.
- Make sure to log in to Google Cloud and note your project ID and location.
- Use the code below to initialize Vertex AI.
```python
import sys

# Additional authentication is required for Google Colab
if "google.colab" in sys.modules:
    # Authenticate the user to Google Cloud
    from google.colab import auth
    auth.authenticate_user()

# Define project information
PROJECT_ID = "<<project_id>>"  # @param {type:"string"}
LOCATION = "<<location>>"  # @param {type:"string"}

# Initialize Vertex AI
import vertexai
vertexai.init(project=PROJECT_ID, location=LOCATION)
```
We'll use open-source packages such as `requests`, `html2text`, and `beautifulsoup4` for web scraping.

```shell
!pip install requests html2text beautifulsoup4
```
Let's define the search query.

```python
search_query = """Sea food near Googleplex
1600 Amphitheatre Parkway
Mountain View, CA 94043
United States"""
```
Here's the code for a simple web scraper.

```python
import requests
from bs4 import BeautifulSoup
import html2text

def scrape_website(url):
    try:
        # Send an HTTP request to the URL
        response = requests.get(url)
        # Check if the request was successful (status code 200)
        if response.status_code == 200:
            # Convert the HTML to plain text
            return html2text.html2text(response.text)
        else:
            print(f"Failed to retrieve content. Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred: {e}")
```
For demonstration purposes, let's run a programmatic Google search and scrape the results. The query contains spaces and newlines, so it must be URL-encoded before being placed in the search URL.

```python
from urllib.parse import quote_plus

url = f"https://www.google.com/search?q={quote_plus(search_query)}"
print(url)

google_search_content = scrape_website(url)
```
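Note that `scrape_website` returns `None` when the request fails, so it's worth failing fast before handing the content to the model. A small guard one could add (this helper is my own sketch, not part of the original post):

```python
def require_content(content):
    """Fail fast when scraping returned nothing usable."""
    if content is None:
        raise RuntimeError("Could not scrape the search results page")
    return content
```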
Now let's define the JSON schema we want the structured response to follow:

```python
schema = """
{
  "places": [
    {
      "name": "",
      "rating": <<float>>,
      "price": "",
      "category": "",
      "address": "",
      "city": "",
      "state": "",
      "zip": "",
      "country": "",
      "phone": "",
      "website": ""
    }
  ]
}
"""
```
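To make the target shape concrete, here's a hypothetical record conforming to the schema, together with a quick structural check (the values are invented purely for illustration; they are not real search results):

```python
# Hypothetical example of a record matching the schema; the values
# are made up purely to show the expected shape.
example_place = {
    "name": "Sea Harbor Restaurant",
    "rating": 4.2,
    "price": "$$",
    "category": "Seafood",
    "address": "1600 Example Ave",
    "city": "Mountain View",
    "state": "CA",
    "zip": "94043",
    "country": "United States",
    "phone": "(650) 555-0100",
    "website": "https://example.com",
}

EXPECTED_KEYS = {
    "name", "rating", "price", "category", "address",
    "city", "state", "zip", "country", "phone", "website",
}

# A quick structural check before trusting downstream code with the data
assert set(example_place) == EXPECTED_KEYS
assert isinstance(example_place["rating"], float)
```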
Time to dive into Gemini Pro. The snippet below queries the model and asks it to reformat the scraped content according to our schema. Note that the schema must actually be interpolated into the prompt; otherwise the model has nothing to format against.

```python
import vertexai
from vertexai.preview.generative_models import GenerativeModel

def google_search_formatted_response(content, max_output_tokens=7815):
    model = GenerativeModel("gemini-pro")
    schema = """
    {
      "places": [
        {
          "name": "",
          "rating": <<float>>,
          "price": "",
          "category": "",
          "address": "",
          "city": "",
          "state": "",
          "zip": "",
          "country": "",
          "phone": "",
          "website": ""
        }
      ]
    }
    """
    responses = model.generate_content(
        f"""Format the below content to the following JSON schema.

        Here's the schema:
        {schema}

        Here's the content:
        {content}
        """,
        generation_config={
            "max_output_tokens": max_output_tokens,
            "temperature": 0,
            "top_p": 1,
        },
        stream=True,
    )

    # Collect the streamed chunks as they arrive
    formatted_response = []
    for response in responses:
        text = response.candidates[0].content.parts[0].text
        print(text)
        formatted_response.append(text)
    return formatted_response

formatted_response = google_search_formatted_response(google_search_content)
```
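The streamed chunks come back as plain text, so they still need to be joined and parsed before use. A minimal sketch (this helper is my own addition; it assumes the model returns valid JSON, possibly wrapped in a Markdown code fence):

```python
import json

def parse_model_json(chunks):
    """Join streamed text chunks and parse them as JSON.

    The model sometimes wraps its output in a ```json ... ``` fence,
    so strip that before parsing.
    """
    text = "".join(chunks).strip()
    if text.startswith("```"):
        # Drop the opening fence line and the trailing fence
        text = text.split("\n", 1)[1]
        text = text.rsplit("```", 1)[0]
    return json.loads(text)

# Hypothetical usage with streamed chunks:
chunks = ['```json\n{"places": [{"name": ', '"Sea Harbor"}]}\n```']
places = parse_model_json(chunks)["places"]
print(places[0]["name"])  # Sea Harbor
```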
Top comments (2)
How long did it take for you?
2 to 3 seconds. I believe that, with local LLMs, things could be significantly faster.