Having a large amount of test data sometimes take a lot of effort, and to simulate a more realistic scenario, it’s good to have a large number of tables with distinct column types. This script generates random tables schema for Hive.
If you want to set up a Hive environment for dev and test purposes, take a look at: https://dev.to/mesmacosta/quickly-set-up-a-hive-environment-on-gcp-38j8
Environment
Activate your virtualenv
pip install --upgrade virtualenv
python3 -m virtualenv --python python3 env
source ./env/bin/activate
Install the requirements for the metadata generator
pip install -r requirements.txt
Code
pyhive | |
pandas | |
sasl>=0.2.1 | |
thrift>=0.10.0 | |
git+https://github.com/cloudera/thrift_sasl # Using master branch in order to get Python 3 SASL patches |
import argparse | |
import logging | |
import random | |
import sys | |
import uuid | |
from pyhive import hive | |
_DATA_TYPES = ['TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'FLOAT', 'DOUBLE', 'DECIMAL', 'TIMESTAMP', 'DATE', | |
'STRING', 'BOOLEAN', 'BINARY', 'ARRAY<STRUCT< key:STRING, value:STRING>>', | |
'ARRAY <STRING>', 'ARRAY <STRUCT <spouse: STRING, children: ARRAY <STRING>>>', | |
'ARRAY<DOUBLE>', 'MAP<STRING,DOUBLE>', 'STRUCT < employer: STRING, id: BIGINT, address: STRING >', | |
'UNIONTYPE<DOUBLE, STRING, ARRAY<string>, STRUCT<a:INT,b:string>>'] | |
_COLUMN_NAMES = ['name', 'address', 'city', 'state', 'date_time', 'paragraph', 'randomdata', 'person', 'credit_card', | |
'size', 'reason', 'school', 'food', 'location', 'house', 'price', 'cpf', 'cnpj', 'passport', | |
'security_number', 'phone_number', 'bank_account_number', 'ip_address', 'stocks'] | |
_DESCRIPTION_VALUES = ['This is a random generated column', 'Description for random generated column'] | |
_TABLE_NAMES = ['school_info', 'personal_info', 'persons', 'employees', 'companies', 'store', 'home'] | |
_DATABASE_NAMES = ['school_warehouse', 'company_warehouse', 'on_prem_warehouse', 'factory_warehouse', | |
'organization_warehouse'] | |
def get_hive_conn(connection_args): | |
return hive.connect(host=connection_args['host'], | |
port=connection_args['port'], | |
username=connection_args['user'], | |
database=connection_args['database'], | |
auth=None) | |
def create_random_hive_data(connection_args): | |
conn = get_hive_conn(connection_args) | |
cursor = conn.cursor() | |
for x in range(4): | |
database_name, database_stmt = build_create_database_statement() | |
print('\n' + database_stmt) | |
cursor.execute(database_stmt) | |
cursor.execute(build_use_database_statement(database_name)) | |
for y in range(250): | |
table_stmt = build_create_table_statement() | |
cursor.execute(table_stmt) | |
print('\n' + table_stmt) | |
cursor.execute('show databases') | |
databases = cursor.fetchall() | |
print(databases) | |
cursor.close() | |
def get_random_data_type(): | |
return random.choice(_DATA_TYPES) | |
def get_random_databases_name(): | |
return random.choice(_DATABASE_NAMES) | |
def get_random_column_name(): | |
return random.choice(_COLUMN_NAMES) | |
def get_random_column_description(): | |
return random.choice(_DESCRIPTION_VALUES) | |
def get_random_table_name(): | |
return random.choice(_TABLE_NAMES) | |
def build_create_database_statement(): | |
database_name = '{}{}'.format(get_random_databases_name(), | |
str(random.randint(1, 100000))) | |
database_stmt = 'CREATE DATABASE {} '.format(database_name) | |
return database_name, database_stmt | |
def build_use_database_statement(database_name): | |
return 'USE {} '.format(database_name) | |
def build_create_table_statement(): | |
table_stmt = 'CREATE TABLE {}{} ( '.format( | |
get_random_table_name(), | |
uuid.uuid4().hex[:8] | |
) | |
table_stmt = '{}{}{} {}'.format( | |
table_stmt, | |
get_random_column_name(), | |
str(random.randint(1, 100000)), | |
get_random_data_type() | |
) | |
for x in range(random.randint(1, 100)): | |
table_stmt += ' , {}{}'.format(get_random_column_name(), str(random.randint(1, 100000))) + \ | |
' {}'.format(get_random_data_type()) + \ | |
' COMMENT "{}"'.format(get_random_column_description()) | |
table_stmt = '{} )'.format(table_stmt) | |
return table_stmt | |
def parse_args(): | |
parser = argparse.ArgumentParser( | |
description='Command line generate random metadata into a Hive server') | |
parser.add_argument( | |
'--hive-host', | |
help='Your Hive server host', | |
required=True) | |
parser.add_argument('--hive-user', | |
help='Your Hive server user') | |
parser.add_argument('--hive-database', | |
help='Your Hive server database name') | |
parser.add_argument('--hive-port', | |
help='Your Hive server port', | |
type=int, | |
default=1000) | |
return parser.parse_args() | |
if __name__ == "__main__": | |
args = parse_args() | |
# Enable logging | |
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) | |
create_random_hive_data({ | |
'database': args.hive_database, | |
'host': args.hive_host, | |
'user': args.hive_user, | |
'port': args.hive_port | |
}) |
Execution
export HIVE_SERVER=127.0.0.1
export HIVE_USERNAME=hive
export HIVE_PORT=10000
export HIVE_DATABASE=default
python metadata_generator.py \
--hive-host=$HIVE_SERVER \
--hive-user=$HIVE_USERNAME \
--hive-port=$HIVE_PORT \
--hive-database=$HIVE_DATABASE
And that's it!
If you have difficulties, don’t hesitate reaching out. I would love to help you!
Top comments (0)