Apache SeaTunnel

Posted on Jul 2

Deploy Apache SeaTunnel 2.3.11 with Docker: A Complete Guide to Syncing Kafka Data to Hive and Elasticsearch

#apacheseatunnel #elasticsearch #docker #kafka

This guide walks you through the complete process of deploying Apache SeaTunnel 2.3.11 with Docker. It covers everything from environment setup and dependency installation to configuring Kafka virtual tables, data sources, and building end-to-end data synchronization pipelines from Kafka to Hive and Elasticsearch.

Prerequisites

Project Directory Structure

seatunnel-docker/
├── docker-compose.yml              # Main Docker Compose configuration
├── hive/                           # Hive configuration
│   ├── hive-site.xml
│   └── lib/                        # Required dependency JARs
│       └── postgresql-42.5.1.jar
├── init-sql/                       # Database initialization scripts
│   └── seatunnel_server_mysql.sql
├── seatunnel/                      # SeaTunnel server configuration
│   ├── Dockerfile
│   └── apache-seatunnel-2.3.11/    # Extracted SeaTunnel binary package
│       └── lib/                    # Required dependency JARs
│           ├── hive-exec-3.1.3.jar
│           ├── hive-metastore-3.1.3.jar
│           ├── libfb303-0.9.3.jar
│           ├── mysql-connector-java-8.0.28.jar
│           └── seatunnel-hadoop3-3.1.4-uber.jar
└── seatunnel-web/                  # SeaTunnel Web configuration
    ├── Dockerfile
    └── apache-seatunnel-web-1.0.3-bin/  # Extracted SeaTunnel Web package
        └── libs/                   # Required dependency JARs
            └── mysql-connector-java-8.0.28.jar

Download Apache SeaTunnel

# Apache SeaTunnel 2.3.11
https://dlcdn.apache.org/seatunnel/2.3.11/apache-seatunnel-2.3.11-bin.tar.gz

# Build SeaTunnel Web 1.0.3 from source
git clone https://github.com/apache/seatunnel-web.git
cd seatunnel-web
sh build.sh code

Download Required Dependencies

# Required by the Hive Metastore container (PostgreSQL is used as the Hive metastore database)
https://jdbc.postgresql.org/download/postgresql-42.5.1.jar

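# Additional dependencies required for Hive synchronization
# (copy them into seatunnel/apache-seatunnel-2.3.11/lib; in practice, only the
# first three JARs are required, while libthrift and hive-common are optional)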
https://repo1.maven.org/maven2/org/apache/hive/hive-exec/3.1.3/hive-exec-3.1.3.jar
https://repo1.maven.org/maven2/org/apache/hive/hive-metastore/3.1.3/hive-metastore-3.1.3.jar
https://repo.maven.apache.org/maven2/org/apache/thrift/libfb303/0.9.3/libfb303-0.9.3.jar
https://repo1.maven.org/maven2/org/apache/thrift/libthrift/0.12.0/libthrift-0.12.0.jar
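https://repo1.maven.org/maven2/org/apache/hive/hive-common/3.1.3/hive-common-3.1.3.jar

# MySQL JDBC driver (copy it into both seatunnel/apache-seatunnel-2.3.11/lib
# and seatunnel-web/apache-seatunnel-web-1.0.3-bin/libs)
https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.28/mysql-connector-java-8.0.28.jar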

Create the Project Directory

Create the seatunnel-docker directory, then place all downloaded packages and configuration files into it, following the directory structure shown above.

mkdir seatunnel-docker
cd seatunnel-docker

Deploy with Docker

Configure `docker-compose.yml`

# Compose file format version.
version: "3.9"

# Dedicated bridge network with a fixed subnet, so that every service in this
# file can be pinned to a static ipv4_address.
networks:
  seatunnel-network:
    driver: bridge
    ipam:
      config:
        - subnet: 172.16.0.0/24

services:
  # ===== Hive Services =====
  hive-metastore-db:
    # PostgreSQL database used by the hive-metastore service.
    image: postgres:15
    container_name: hive-metastore-db
    hostname: hive-metastore-db
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.2
    ports:
      - "5432:5432"
    environment:
      - POSTGRES_DB=metastore_db
      - POSTGRES_USER=hive
      - POSTGRES_PASSWORD=hive123456
    volumes:
      # Database files live in a host directory next to this compose file.
      - ./hive-metastore-db-data:/var/lib/postgresql/data
    healthcheck:
      # hive-metastore starts only after this check reports healthy.
      test: ["CMD", "pg_isready", "-U", "hive", "-d", "metastore_db"]
      start_period: 10s
      interval: 5s
      timeout: 5s
      retries: 10

  hive-metastore:
    # Hive Metastore service; stores its metadata in hive-metastore-db.
    image: apache/hive:4.0.0
    container_name: hive-metastore
    hostname: hive-metastore
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.3
    ports:
      - "9083:9083"
    depends_on:
      hive-metastore-db:
        # Do not start until the PostgreSQL health check passes.
        condition: service_healthy
    environment:
      SERVICE_NAME: metastore
      DB_DRIVER: postgres
      # JDBC connection settings for the metastore database. The folded scalar
      # (>-) joins the lines below into one space-separated option string.
      SERVICE_OPTS: >-
        -Djavax.jdo.option.ConnectionDriverName=org.postgresql.Driver
        -Djavax.jdo.option.ConnectionURL=jdbc:postgresql://hive-metastore-db:5432/metastore_db
        -Djavax.jdo.option.ConnectionUserName=hive
        -Djavax.jdo.option.ConnectionPassword=hive123456
    volumes:
      # Warehouse directory shared with the other containers via the host.
      - ./hive-warehouse:/opt/hive/data/warehouse
      # Site configuration and the PostgreSQL JDBC driver supplied from the host.
      - ./hive/hive-site.xml:/opt/hive/conf/hive-site.xml
      - ./hive/lib/postgresql-42.5.1.jar:/opt/hive/lib/postgresql-42.5.1.jar

  hive-server2:
    # HiveServer2 endpoint used by the Beeline commands in this guide.
    image: apache/hive:4.0.0
    container_name: hive-server2
    hostname: hive-server2
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.4
    ports:
      - "10000:10000"
      - "10002:10002"
    depends_on:
      hive-metastore:
        condition: service_started
    environment:
      - "HIVE_SERVER2_THRIFT_PORT=10000"
      - "SERVICE_NAME=hiveserver2"
      - "IS_RESUME=true"
      # Point HiveServer2 at the metastore container over Thrift.
      - "SERVICE_OPTS=-Dhive.metastore.uris=thrift://hive-metastore:9083"
    volumes:
      # Same host directory that hive-metastore and the SeaTunnel nodes mount.
      - ./hive-warehouse:/opt/hive/data/warehouse

  # ===== MySQL =====
  mysql-seatunnel:
    # MySQL instance that holds the `seatunnel` database.
    image: mysql:8.0.42
    container_name: mysql-seatunnel
    hostname: mysql-seatunnel
    command: ["--default-authentication-plugin=mysql_native_password"]
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.5
    ports:
      # Published on host port 3806; inside the network it remains 3306.
      - "3806:3306"
    environment:
      - "MYSQL_ROOT_PASSWORD=root123456"
      - "MYSQL_DATABASE=seatunnel"
      - "MYSQL_ROOT_HOST=%"
    volumes:
      # SQL scripts from ./init-sql are handed to the image's init directory.
      - ./init-sql:/docker-entrypoint-initdb.d
      - ./mysql_data:/var/lib/mysql
    healthcheck:
      test: ["CMD", "mysqladmin", "ping", "-h", "localhost"]
      interval: 10s
      timeout: 5s
      retries: 5

  # ===== SeaTunnel Services =====
  seatunnel-master:
    # Builds the seatunnel:2.3.11 image (also used by the workers) and runs the
    # cluster script in the master role.
    build:
      context: ./seatunnel
      dockerfile: Dockerfile
    image: seatunnel:2.3.11
    container_name: seatunnel-master
    hostname: seatunnel-master
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.10
    ports:
      - "5801:5801"
    extra_hosts:
      # Static name-to-IP mappings for the Hive containers.
      - "hive-metastore-db:172.16.0.2"
      - "hive-metastore:172.16.0.3"
    environment:
      SEATUNNEL_HOME: /opt/seatunnel
    command:
      - sh
      - -c
      - cd /opt/seatunnel && exec bin/seatunnel-cluster.sh -r master
    volumes:
      # The extracted SeaTunnel distribution is supplied from the host.
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./logs/master:/opt/seatunnel/logs
      # Hive warehouse directory, backed by the host so the data is persisted.
      - ./hive-warehouse:/opt/hive/data/warehouse

  seatunnel-worker1:
    # First worker node; reuses the image built by seatunnel-master.
    image: seatunnel:2.3.11
    container_name: seatunnel-worker1
    hostname: seatunnel-worker1
    depends_on:
      seatunnel-master:
        condition: service_started
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.11
    extra_hosts:
      # Static name-to-IP mappings for the Hive containers.
      - "hive-metastore-db:172.16.0.2"
      - "hive-metastore:172.16.0.3"
    environment:
      SEATUNNEL_HOME: /opt/seatunnel
    command:
      - sh
      - -c
      - cd /opt/seatunnel && exec bin/seatunnel-cluster.sh -r worker
    volumes:
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./logs/worker1:/opt/seatunnel/logs
      # Hive warehouse directory, backed by the host so the data is persisted.
      - ./hive-warehouse:/opt/hive/data/warehouse

  seatunnel-worker2:
    # Second worker node; identical to worker1 apart from name, log path and IP.
    image: seatunnel:2.3.11
    container_name: seatunnel-worker2
    hostname: seatunnel-worker2
    depends_on:
      seatunnel-master:
        condition: service_started
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.12
    extra_hosts:
      # Static name-to-IP mappings for the Hive containers.
      - "hive-metastore-db:172.16.0.2"
      - "hive-metastore:172.16.0.3"
    environment:
      SEATUNNEL_HOME: /opt/seatunnel
    command:
      - sh
      - -c
      - cd /opt/seatunnel && exec bin/seatunnel-cluster.sh -r worker
    volumes:
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./logs/worker2:/opt/seatunnel/logs
      # Hive warehouse directory, backed by the host so the data is persisted.
      - ./hive-warehouse:/opt/hive/data/warehouse

  seatunnel-web:
    # SeaTunnel Web UI, published on host port 8801.
    build:
      context: ./seatunnel-web
      dockerfile: Dockerfile
    image: seatunnel-web:1.0.3
    container_name: seatunnel-web
    hostname: seatunnel-web
    extra_hosts:
      # Static name-to-IP mappings for the Hive containers.
      - "hive-metastore:172.16.0.3"
      - "hive-metastore-db:172.16.0.2"
    environment:
      - SEATUNNEL_HOME=/opt/seatunnel
      - SEATUNNEL_WEB_HOME=/opt/seatunnel-web
    ports:
      - "8801:8801"
    volumes:
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./seatunnel-web/apache-seatunnel-web-1.0.3-bin/:/opt/seatunnel-web/
      - ./logs/web:/opt/seatunnel-web/logs
      # Mount the Hive warehouse directory to keep the runtime environment consistent
      - ./hive-warehouse:/opt/hive/data/warehouse
    depends_on:
      seatunnel-master:
        condition: service_started
      # Wait for the mysql-seatunnel health check to pass, so the web
      # application does not start before its MySQL database is accepting
      # connections.
      mysql-seatunnel:
        condition: service_healthy
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.13

SeaTunnel Configuration

Dockerfile

Create the following Dockerfile for the SeaTunnel service:

# JDK 8 base image. No SeaTunnel files are copied into the image: docker-compose.yml
# bind-mounts the extracted distribution at /opt/seatunnel when the container starts.
FROM eclipse-temurin:8-jdk-ubi9-minimal

WORKDIR /opt/seatunnel/

# Environment variables: the install location, plus its bin/ directory on PATH
ENV SEATUNNEL_HOME=/opt/seatunnel
ENV PATH=$PATH:$SEATUNNEL_HOME/bin

# Expose the cluster communication port (5801 is the port set in hazelcast-master.yaml)
EXPOSE 5801

# Default startup command: run as a master node. docker-compose.yml overrides this with
# its own `command`, which is how the worker containers are started with `-r worker`.
CMD ["sh", "bin/seatunnel-cluster.sh", "-r", "master"]

Configure `hazelcast-client.yaml`

Edit:

seatunnel/apache-seatunnel-2.3.11/config/hazelcast-client.yaml

Configure the SeaTunnel client to connect to the cluster:

# Client-side settings: which cluster to join and where to find it.
hazelcast-client:
  cluster-name: seatunnel
  network:
    # The client connects to the master container on port 5801.
    cluster-members:
      - seatunnel-master:5801
  connection-strategy:
    connection-retry:
      cluster-connect-timeout-millis: 3000
  properties:
    hazelcast.logging.type: log4j2

Configure `hazelcast-master.yaml`

Edit:

seatunnel/apache-seatunnel-2.3.11/config/hazelcast-master.yaml

Configure the master node:

# Master node: fixed port 5801, joins the cluster through a static TCP/IP member list.
hazelcast:
  cluster-name: seatunnel
  network:
    port:
      port: 5801
      auto-increment: false
    join:
      tcp-ip:
        enabled: true
        # Container hostnames from docker-compose.yml; workers listen on 5802.
        member-list:
          - seatunnel-master:5801
          - seatunnel-worker1:5802
          - seatunnel-worker2:5802
    rest-api:
      enabled: false
      endpoint-groups:
        DATA:
          enabled: true
        CLUSTER_WRITE:
          enabled: true
  properties:
    hazelcast.logging.type: log4j2
    hazelcast.operation.generic.thread.count: 50
    hazelcast.invocation.max.retry.count: 20
    hazelcast.tcp.join.port.try.count: 30
    # Heartbeat and failure-detection tuning.
    hazelcast.heartbeat.interval.seconds: 2
    hazelcast.max.no.heartbeat.seconds: 180
    hazelcast.heartbeat.failuredetector.type: phi-accrual
    hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10
    hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200
    hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100

Configure `hazelcast-worker.yaml`

Edit:

seatunnel/apache-seatunnel-2.3.11/config/hazelcast-worker.yaml

Configure each worker node:

# Worker node: fixed port 5802, same static member list as the master.
hazelcast:
  cluster-name: seatunnel
  network:
    port:
      port: 5802
      auto-increment: false
    join:
      tcp-ip:
        enabled: true
        # Container hostnames from docker-compose.yml; the master listens on 5801.
        member-list:
          - seatunnel-master:5801
          - seatunnel-worker1:5802
          - seatunnel-worker2:5802
  properties:
    hazelcast.logging.type: log4j2
    hazelcast.operation.generic.thread.count: 50
    hazelcast.invocation.max.retry.count: 20
    hazelcast.tcp.join.port.try.count: 30
    # Heartbeat and failure-detection tuning (same values as the master).
    hazelcast.heartbeat.interval.seconds: 2
    hazelcast.max.no.heartbeat.seconds: 180
    hazelcast.heartbeat.failuredetector.type: phi-accrual
    hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10
    hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200
    hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100

Install Connector Plugins

If no options appear in the Source component when creating a synchronization job, the required connector plugins have not been installed.

Run the following command to install all supported connector plugins:

cd seatunnel/apache-seatunnel-2.3.11/
sh bin/install-plugin.sh

Hive Configuration

Configure `hive-site.xml`

Edit:

hive/hive-site.xml

Update the configuration as follows:

<?xml version="1.0" encoding="UTF-8"?>
<!-- Hive site configuration mounted into the hive-metastore container at /opt/hive/conf/hive-site.xml. -->
<configuration>
    <!-- Thrift endpoint of the metastore: the hive-metastore container on port 9083. -->
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hive-metastore:9083</value>
    </property>

    <!-- Warehouse root; the same container path that docker-compose.yml maps to ./hive-warehouse. -->
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/opt/hive/data/warehouse</value>
    </property>

    <!-- NOTE(review): presumably turns off the authorization check on the metastore
         notification-event API; confirm against the Hive configuration reference. -->
    <property>
        <name>metastore.metastore.event.db.notification.api.auth</name>
        <value>false</value>
    </property>
</configuration>

Add Required Dependencies

Place the following JDBC driver in the hive/lib directory.

postgresql-42.5.1.jar

MySQL Configuration

Initialize the Database

Copy the initialization SQL script from the SeaTunnel Web package into the init-sql directory.

cd seatunnel-docker

cp seatunnel-web/apache-seatunnel-web-1.0.3-bin/script/seatunnel_server_mysql.sql \
init-sql/seatunnel_server_mysql.sql

Start the Docker Environment

Build and start all services:

# Build and start all services
docker compose up -d --build

# Open the SeaTunnel Web UI
# Default credentials:
# Username: admin
# Password: admin
open http://localhost:8801

Running Example

After all services have started successfully, log in to the SeaTunnel Web UI and complete the following configuration steps.

Configure the Display Language

Login Page

Open Settings

Change the Language

Select your preferred language from the language settings.

Configure Data Sources

Before creating synchronization jobs, configure the required data sources.

Configure a Kafka Data Source

Create a Kafka connection by providing the cluster address and connection parameters.

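Note that the docker-compose.yml in this guide does not start a Kafka broker, so the address you enter must be reachable from inside the SeaTunnel containers.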

Configure an Elasticsearch Data Source

Configure your Elasticsearch cluster information, including the endpoint and authentication credentials if required.

Configure a Hive Metastore Local Data Source

You can configure the Hive Metastore endpoint using:

thrift://hive-metastore:9083

Configure Virtual Tables

Create a Virtual Table

Follow these steps to create a virtual table:

Navigate to Virtual Tables.
Click Create.
Select an existing data source.
Configure the virtual table properties.
Click Next to define field mappings.
Review the configuration.
Save the virtual table.

Create Synchronization Jobs

Once the data sources and virtual tables are ready, you can build synchronization pipelines using the visual designer.

Kafka → Hive Synchronization

Configure the Job Components

Source

Configure the Kafka source by selecting the previously created Kafka data source.

Field Mapper

Open the Model view to define field mappings between the source and destination schemas.

Sink

Configure Hive as the destination and specify the target database and table.

Kafka → Elasticsearch Synchronization

Configure the Job Components

Source

Configure the Kafka source.

Field Mapper

Configure field mappings in the Model view.

Sink

Configure Elasticsearch as the destination.

Specify the target index and any required connection parameters.

General Workflow for Creating Synchronization Jobs

To create a synchronization job in SeaTunnel Web:

Navigate to Jobs → Synchronization Job Definitions.
Click Create.
Drag or select the Source, Field Mapper, and Sink components to build the pipeline.
Double-click the Source component and select the configured Kafka data source.
Double-click Field Mapper, then open the Model view to configure field mappings.
Double-click the Sink component and configure Hive or Elasticsearch as the destination.
Save the job.
Start the synchronization job.

Important

Before saving the job, make sure to configure the Job Mode.

Otherwise, the job cannot be saved and the following error will be displayed:
job env can't be empty, please change config

Hive Operations

Create a Table

Use one of the following commands to create a table in Hive.

# Run Beeline inside the hive-server2 container and create a tab-delimited
# text-file table (the statement is skipped if the table already exists)
docker exec -it hive-server2 beeline -u jdbc:hive2://localhost:10000 -e "
CREATE TABLE IF NOT EXISTS default.test_user_data3 (
user_id STRING,
type STRING,
content STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
"

Alternatively, you can create the table in Parquet format, which is recommended for better storage efficiency and query performance.

docker exec -it hive-server2 beeline -u jdbc:hive2://localhost:10000 -e "
CREATE TABLE IF NOT EXISTS default.test_user_data3 (
user_id STRING,
type STRING,
content STRING
)
STORED AS PARQUET;
"

View the Table Schema

Run the following command to verify that the table has been created successfully.

docker exec -it hive-server2 beeline -u jdbc:hive2://localhost:10000 -e "
SHOW TABLES IN default;
DESCRIBE default.test_user_data3;
"

Query Table Data

Run the following command to query the synchronized data stored in Hive.

docker exec -it hive-server2 beeline -u jdbc:hive2://localhost:10000 -e "
SELECT * FROM default.test_user_data3 LIMIT 10;
"

Troubleshooting

Hive Metastore URI Parsing Error

If the following exception is reported:

seatunnel seatunnel-web ERROR [qtp2135089262-20] [MetaStoreUtils.logAndThrowMetaException():166] - Got exception: java.net.URISyntaxException Illegal character in hostname at index 44: thrift://hive-metastore.seatunnel-docker_seatunnel-network:9083

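The hostname in this URI contains an underscore, which comes from the Docker Compose network name (seatunnel-docker_seatunnel-network); Java's URI parser does not accept underscores in hostnames. To make the containers resolve the Hive services by their short names instead, add static hostname mappings to the corresponding services in docker-compose.yml.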

# Static name-to-IP mappings; the addresses are the ipv4_address values
# assigned to the Hive containers in docker-compose.yml.
extra_hosts:
  - "hive-metastore:172.16.0.3"
  - "hive-metastore-db:172.16.0.2"

Hive Synchronization Fails with `java.lang.NoClassDefFoundError`

If a Hive synchronization job fails with java.lang.NoClassDefFoundError, ensure that the required dependency JARs are available in:

seatunnel/apache-seatunnel-2.3.11/lib

Required dependencies:

hive-exec-3.1.3.jar
hive-metastore-3.1.3.jar
libfb303-0.9.3.jar

Hive Synchronization Job Completes Successfully but No Data Is Written

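If the synchronization job completes successfully but no data is written to Hive, verify that the Hive warehouse directory is mounted correctly in docker-compose.yml. The same host directory must be mounted at /opt/hive/data/warehouse in the Hive containers and in the SeaTunnel master and worker containers, as in the docker-compose.yml above; otherwise, the files written by SeaTunnel are presumably not visible to Hive.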

volumes:
  # Mount the Hive warehouse directory to ensure data is persisted on the host
  - ./hive-warehouse:/opt/hive/data/warehouse

Check Which Worker Executes the Job

You can identify which worker node is executing the synchronization job by reviewing the master log:

./logs/master/seatunnel-engine-master.log

Example:

Task [TaskGroupLocation{jobId=1080750681855361026, pipelineId=1, taskGroupId=2}] will be executed on worker [[seatunnel-worker2]:5801], slotID [2], resourceProfile [ResourceProfile{cpu=CPU{core=0}, heapMemory=Memory{bytes=0}}], sequence [db6b679c-67cc-43b8-b64a-acaa85c2a4c0], assigned [1080750681855361026]

Prerequisites

Project Directory Structure

Download Apache SeaTunnel

Download Required Dependencies

Create the Project Directory

Deploy with Docker

Configure docker-compose.yml

SeaTunnel Configuration

Dockerfile

Configure hazelcast-client.yaml

Configure hazelcast-master.yaml

Configure hazelcast-worker.yaml

Install Connector Plugins

Hive Configuration

Configure hive-site.xml

Add Required Dependencies

MySQL Configuration

Initialize the Database

Start the Docker Environment

Running Example

Configure the Display Language

Login Page

Open Settings

Change the Language

Configure Data Sources

Configure a Kafka Data Source

Configure an Elasticsearch Data Source

Configure a Hive Metastore Local Data Source

Configure Virtual Tables

Create a Virtual Table

Create Synchronization Jobs

Kafka → Hive Synchronization

Configure the Job Components

Kafka → Elasticsearch Synchronization

Configure the Job Components

General Workflow for Creating Synchronization Jobs

Hive Operations

Create a Table

View the Table Schema

Query Table Data

Troubleshooting

Hive Metastore URI Parsing Error

Hive Synchronization Fails with java.lang.NoClassDefFoundError

Hive Synchronization Job Completes Successfully but No Data Is Written

Check Which Worker Executes the Job

Configure `docker-compose.yml`

Configure `hazelcast-client.yaml`

Configure `hazelcast-master.yaml`

Configure `hazelcast-worker.yaml`

Configure `hive-site.xml`

Hive Synchronization Fails with `java.lang.NoClassDefFoundError`