This guide walks you through the complete process of deploying Apache SeaTunnel 2.3.11 with Docker. It covers everything from environment setup and dependency installation to configuring Kafka virtual tables, data sources, and building end-to-end data synchronization pipelines from Kafka to Hive and Elasticsearch.
Prerequisites
Project Directory Structure
seatunnel-docker/
├── docker-compose.yml                       # Main Docker Compose configuration
├── hive/                                    # Hive configuration
│   ├── hive-site.xml
│   └── lib/                                 # Required dependency JARs
│       └── postgresql-42.5.1.jar
├── init-sql/                                # Database initialization scripts
│   └── seatunnel_server_mysql.sql
├── seatunnel/                               # SeaTunnel server configuration
│   ├── Dockerfile
│   └── apache-seatunnel-2.3.11/             # Extracted SeaTunnel binary package
│       └── lib/                             # Required dependency JARs
│           ├── hive-exec-3.1.3.jar
│           ├── hive-metastore-3.1.3.jar
│           ├── libfb303-0.9.3.jar
│           ├── mysql-connector-java-8.0.28.jar
│           └── seatunnel-hadoop3-3.1.4-uber.jar
└── seatunnel-web/                           # SeaTunnel Web configuration
    ├── Dockerfile
    └── apache-seatunnel-web-1.0.3-bin/      # Extracted SeaTunnel Web package
        └── libs/                            # Required dependency JARs
            └── mysql-connector-java-8.0.28.jar
Download Apache SeaTunnel
# Apache SeaTunnel 2.3.11 binary package
# (extract it so that it ends up at seatunnel/apache-seatunnel-2.3.11/)
https://dlcdn.apache.org/seatunnel/2.3.11/apache-seatunnel-2.3.11-bin.tar.gz
# Build SeaTunnel Web 1.0.3 from source
# (the extracted package is expected at seatunnel-web/apache-seatunnel-web-1.0.3-bin/)
git clone https://github.com/apache/seatunnel-web.git
cd seatunnel-web
sh build.sh code
Download Required Dependencies
# Required by the Hive Metastore container (PostgreSQL is used as the Hive metastore database)
# Destination: hive/lib/
https://jdbc.postgresql.org/download/postgresql-42.5.1.jar
# Additional dependencies required for Hive synchronization
# (In practice, only the first three JARs are required)
# Destination: seatunnel/apache-seatunnel-2.3.11/lib/
https://repo1.maven.org/maven2/org/apache/hive/hive-exec/3.1.3/hive-exec-3.1.3.jar
https://repo1.maven.org/maven2/org/apache/hive/hive-metastore/3.1.3/hive-metastore-3.1.3.jar
https://repo1.maven.org/maven2/org/apache/thrift/libfb303/0.9.3/libfb303-0.9.3.jar
https://repo1.maven.org/maven2/org/apache/thrift/libthrift/0.12.0/libthrift-0.12.0.jar
https://repo1.maven.org/maven2/org/apache/hive/hive-common/3.1.3/hive-common-3.1.3.jar
# MySQL JDBC driver listed in the project layout
# Destinations: seatunnel/apache-seatunnel-2.3.11/lib/ and
#               seatunnel-web/apache-seatunnel-web-1.0.3-bin/libs/
https://repo1.maven.org/maven2/mysql/mysql-connector-java/8.0.28/mysql-connector-java-8.0.28.jar
Create the Project Directory
Create the seatunnel-docker directory, then place all downloaded packages and configuration files inside it, following the project directory structure shown above.
# Create the project root together with the sub-directories that later steps
# copy files into (hive/lib, init-sql, seatunnel, seatunnel-web).
# -p makes the command safe to re-run when the directories already exist.
mkdir -p seatunnel-docker/hive/lib \
         seatunnel-docker/init-sql \
         seatunnel-docker/seatunnel \
         seatunnel-docker/seatunnel-web
cd seatunnel-docker
Deploy with Docker
Configure docker-compose.yml
# Docker Compose stack for SeaTunnel 2.3.11:
#   - Hive: PostgreSQL metastore database, Hive Metastore, HiveServer2
#   - MySQL: backing database for SeaTunnel Web
#   - SeaTunnel: one master, two workers, and the SeaTunnel Web UI
# NOTE: the passwords below are sample values; change them outside local testing.
version: '3.9'

networks:
  # Fixed subnet so that every service can be pinned to a static IPv4 address.
  seatunnel-network:
    driver: bridge
    ipam:
      config:
        - subnet: 172.16.0.0/24

services:
  # ===== Hive Services =====
  hive-metastore-db:
    image: postgres:15
    container_name: hive-metastore-db
    hostname: hive-metastore-db
    environment:
      POSTGRES_DB: metastore_db
      POSTGRES_USER: hive
      POSTGRES_PASSWORD: hive123456
    ports:
      - "5432:5432"
    volumes:
      - ./hive-metastore-db-data:/var/lib/postgresql/data
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.2
    # Health check: hive-metastore waits on this before starting.
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U hive -d metastore_db"]
      interval: 5s
      timeout: 5s
      retries: 10
      start_period: 10s

  hive-metastore:
    image: apache/hive:4.0.0
    container_name: hive-metastore
    hostname: hive-metastore
    depends_on:
      hive-metastore-db:
        condition: service_healthy  # Wait until the database is healthy
    environment:
      SERVICE_NAME: metastore
      DB_DRIVER: postgres
      # Folded into a single line of JVM options; credentials must match
      # the POSTGRES_* values of hive-metastore-db.
      SERVICE_OPTS: >-
        -Djavax.jdo.option.ConnectionDriverName=org.postgresql.Driver
        -Djavax.jdo.option.ConnectionURL=jdbc:postgresql://hive-metastore-db:5432/metastore_db
        -Djavax.jdo.option.ConnectionUserName=hive
        -Djavax.jdo.option.ConnectionPassword=hive123456
    ports:
      - "9083:9083"
    volumes:
      - ./hive/lib/postgresql-42.5.1.jar:/opt/hive/lib/postgresql-42.5.1.jar
      - ./hive/hive-site.xml:/opt/hive/conf/hive-site.xml
      - ./hive-warehouse:/opt/hive/data/warehouse
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.3

  hive-server2:
    image: apache/hive:4.0.0
    container_name: hive-server2
    hostname: hive-server2
    depends_on:
      - hive-metastore
    environment:
      # Quoted so the value is always passed to the container as a string.
      HIVE_SERVER2_THRIFT_PORT: "10000"
      SERVICE_NAME: hiveserver2
      IS_RESUME: "true"
      SERVICE_OPTS: "-Dhive.metastore.uris=thrift://hive-metastore:9083"
    ports:
      - "10000:10000"
      - "10002:10002"
    volumes:
      - ./hive-warehouse:/opt/hive/data/warehouse
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.4

  # ===== MySQL =====
  mysql-seatunnel:
    image: mysql:8.0.42
    container_name: mysql-seatunnel
    hostname: mysql-seatunnel
    environment:
      MYSQL_ROOT_PASSWORD: root123456
      MYSQL_DATABASE: seatunnel
      MYSQL_ROOT_HOST: '%'
    ports:
      # Published on host port 3806; containers still reach it on 3306.
      - "3806:3306"
    volumes:
      - ./mysql_data:/var/lib/mysql
      # SQL scripts in this directory are run when the data directory is first initialized.
      - ./init-sql:/docker-entrypoint-initdb.d
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.5
    command: --default-authentication-plugin=mysql_native_password
    healthcheck:
      test: ["CMD", "mysqladmin", "ping", "-h", "localhost"]
      interval: 10s
      timeout: 5s
      retries: 5

  # ===== SeaTunnel Services =====
  seatunnel-master:
    build:
      context: ./seatunnel
      dockerfile: Dockerfile
    image: seatunnel:2.3.11
    container_name: seatunnel-master
    hostname: seatunnel-master
    # Static host entries so Hive hosts resolve to their short names
    # (see "Hive Metastore URI Parsing Error" in Troubleshooting).
    extra_hosts:
      - "hive-metastore:172.16.0.3"
      - "hive-metastore-db:172.16.0.2"
    environment:
      - SEATUNNEL_HOME=/opt/seatunnel
    command: >-
      sh -c "cd /opt/seatunnel &&
      exec bin/seatunnel-cluster.sh -r master"
    ports:
      - "5801:5801"
    volumes:
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./logs/master:/opt/seatunnel/logs
      # Mount the Hive warehouse directory to ensure data is persisted on the host
      - ./hive-warehouse:/opt/hive/data/warehouse
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.10

  seatunnel-worker1:
    # Reuses the image built by seatunnel-master.
    image: seatunnel:2.3.11
    container_name: seatunnel-worker1
    hostname: seatunnel-worker1
    extra_hosts:
      - "hive-metastore:172.16.0.3"
      - "hive-metastore-db:172.16.0.2"
    environment:
      - SEATUNNEL_HOME=/opt/seatunnel
    command: >-
      sh -c "cd /opt/seatunnel &&
      exec bin/seatunnel-cluster.sh -r worker"
    volumes:
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./logs/worker1:/opt/seatunnel/logs
      # Mount the Hive warehouse directory to ensure data is persisted on the host
      - ./hive-warehouse:/opt/hive/data/warehouse
    depends_on:
      - seatunnel-master
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.11

  seatunnel-worker2:
    # Reuses the image built by seatunnel-master.
    image: seatunnel:2.3.11
    container_name: seatunnel-worker2
    hostname: seatunnel-worker2
    extra_hosts:
      - "hive-metastore:172.16.0.3"
      - "hive-metastore-db:172.16.0.2"
    environment:
      - SEATUNNEL_HOME=/opt/seatunnel
    command: >-
      sh -c "cd /opt/seatunnel &&
      exec bin/seatunnel-cluster.sh -r worker"
    volumes:
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./logs/worker2:/opt/seatunnel/logs
      # Mount the Hive warehouse directory to ensure data is persisted on the host
      - ./hive-warehouse:/opt/hive/data/warehouse
    depends_on:
      - seatunnel-master
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.12

  seatunnel-web:
    build:
      context: ./seatunnel-web
      dockerfile: Dockerfile
    image: seatunnel-web:1.0.3
    container_name: seatunnel-web
    hostname: seatunnel-web
    extra_hosts:
      - "hive-metastore:172.16.0.3"
      - "hive-metastore-db:172.16.0.2"
    environment:
      - SEATUNNEL_HOME=/opt/seatunnel
      - SEATUNNEL_WEB_HOME=/opt/seatunnel-web
    ports:
      - "8801:8801"
    volumes:
      - ./seatunnel/apache-seatunnel-2.3.11/:/opt/seatunnel/
      - ./seatunnel-web/apache-seatunnel-web-1.0.3-bin/:/opt/seatunnel-web/
      - ./logs/web:/opt/seatunnel-web/logs
      # Mount the Hive warehouse directory to keep the runtime environment consistent
      - ./hive-warehouse:/opt/hive/data/warehouse
    depends_on:
      seatunnel-master:
        condition: service_started
      # SeaTunnel Web stores its metadata in MySQL, so wait for the
      # mysql-seatunnel health check instead of racing database start-up.
      mysql-seatunnel:
        condition: service_healthy
    networks:
      seatunnel-network:
        ipv4_address: 172.16.0.13
SeaTunnel Configuration
Dockerfile
Create the following Dockerfile for the SeaTunnel service:
# Runtime image for SeaTunnel cluster nodes: a Java 8 JDK base only.
# The SeaTunnel distribution itself is bind-mounted at /opt/seatunnel by
# docker-compose.yml rather than copied into the image.
FROM eclipse-temurin:8-jdk-ubi9-minimal

# Installation root, and its bin/ directory appended to PATH
ENV SEATUNNEL_HOME=/opt/seatunnel
ENV PATH=${PATH}:${SEATUNNEL_HOME}/bin

WORKDIR /opt/seatunnel/

# Cluster communication port
EXPOSE 5801

# Default role is master; docker-compose.yml overrides this per service
CMD ["sh", "bin/seatunnel-cluster.sh", "-r", "master"]
Configure hazelcast-client.yaml
Edit:
seatunnel/apache-seatunnel-2.3.11/config/hazelcast-client.yaml
Configure the SeaTunnel client to connect to the cluster:
# SeaTunnel client settings: how clients locate and join the engine cluster.
hazelcast-client:
  # Must match cluster-name in hazelcast-master.yaml and hazelcast-worker.yaml.
  cluster-name: seatunnel
  properties:
    hazelcast.logging.type: log4j2
  connection-strategy:
    connection-retry:
      cluster-connect-timeout-millis: 3000
  network:
    # Clients connect through the master node.
    cluster-members:
      - seatunnel-master:5801
Configure hazelcast-master.yaml
Edit:
seatunnel/apache-seatunnel-2.3.11/config/hazelcast-master.yaml
Configure the master node:
# Hazelcast settings for the SeaTunnel master node.
hazelcast:
  cluster-name: seatunnel
  network:
    # NOTE(review): the REST API is disabled while two endpoint groups are
    # enabled; confirm whether rest-api.enabled is intended to be false.
    rest-api:
      enabled: false
      endpoint-groups:
        CLUSTER_WRITE:
          enabled: true
        DATA:
          enabled: true
    join:
      tcp-ip:
        enabled: true
        # Master listens on 5801, workers on 5802 (see hazelcast-worker.yaml).
        member-list:
          - seatunnel-master:5801
          - seatunnel-worker1:5802
          - seatunnel-worker2:5802
    port:
      auto-increment: false
      port: 5801
  properties:
    hazelcast.invocation.max.retry.count: 20
    hazelcast.tcp.join.port.try.count: 30
    hazelcast.logging.type: log4j2
    hazelcast.operation.generic.thread.count: 50
    hazelcast.heartbeat.failuredetector.type: phi-accrual
    hazelcast.heartbeat.interval.seconds: 2
    hazelcast.max.no.heartbeat.seconds: 180
    hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10
    hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200
    hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100
Configure hazelcast-worker.yaml
Edit:
seatunnel/apache-seatunnel-2.3.11/config/hazelcast-worker.yaml
Configure each worker node:
# Hazelcast settings shared by every SeaTunnel worker node.
hazelcast:
  cluster-name: seatunnel
  network:
    join:
      tcp-ip:
        enabled: true
        # Same member list as hazelcast-master.yaml.
        member-list:
          - seatunnel-master:5801
          - seatunnel-worker1:5802
          - seatunnel-worker2:5802
    port:
      # Workers listen on a fixed port, 5802.
      auto-increment: false
      port: 5802
  properties:
    hazelcast.invocation.max.retry.count: 20
    hazelcast.tcp.join.port.try.count: 30
    hazelcast.logging.type: log4j2
    hazelcast.operation.generic.thread.count: 50
    hazelcast.heartbeat.failuredetector.type: phi-accrual
    hazelcast.heartbeat.interval.seconds: 2
    hazelcast.max.no.heartbeat.seconds: 180
    hazelcast.heartbeat.phiaccrual.failuredetector.threshold: 10
    hazelcast.heartbeat.phiaccrual.failuredetector.sample.size: 200
    hazelcast.heartbeat.phiaccrual.failuredetector.min.std.dev.millis: 100
Install Connector Plugins
If no options appear in the Source component when creating a synchronization job, the required connector plugins have not been installed.
Run the following command to install all supported connector plugins:
# Run from the seatunnel-docker project directory (the path below is relative to it).
cd seatunnel/apache-seatunnel-2.3.11/
# Installs the connector plugins into this SeaTunnel distribution.
sh bin/install-plugin.sh
Hive Configuration
Configure hive-site.xml
Edit:
hive/hive-site.xml
Update the configuration as follows:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Hive settings mounted into the hive-metastore container at /opt/hive/conf/hive-site.xml -->
<configuration>

  <!-- Thrift endpoint of the metastore service -->
  <property>
    <name>hive.metastore.uris</name>
    <value>thrift://hive-metastore:9083</value>
  </property>

  <!-- Warehouse root; the same path is bind-mounted into the Hive and SeaTunnel containers -->
  <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/opt/hive/data/warehouse</value>
  </property>

  <!-- Turns off the auth check on the metastore event notification API -->
  <property>
    <name>metastore.metastore.event.db.notification.api.auth</name>
    <value>false</value>
  </property>

</configuration>
Add Required Dependencies
Place the following JDBC driver in the hive/lib directory.
postgresql-42.5.1.jar
MySQL Configuration
Initialize the Database
Copy the initialization SQL script from the SeaTunnel Web package into the init-sql directory.
# Enter the project root (skip this if it is already the current directory).
cd seatunnel-docker
# init-sql/ is mounted into the MySQL container as /docker-entrypoint-initdb.d,
# so this script is what creates the SeaTunnel Web schema.
cp seatunnel-web/apache-seatunnel-web-1.0.3-bin/script/seatunnel_server_mysql.sql \
init-sql/seatunnel_server_mysql.sql
Start the Docker Environment
Build and start all services:
# Build the images and start all services in the background
docker compose up -d --build
# Open the SeaTunnel Web UI (served under the /ui/ path)
# Default credentials:
#   Username: admin
#   Password: admin
# `open` is macOS-specific; on Linux use `xdg-open`, or paste the URL into a browser.
open http://localhost:8801/ui/
Running Example
After all services have started successfully, log in to the SeaTunnel Web UI and complete the following configuration steps.
Configure the Display Language
Login Page
Open Settings
Change the Language
Select your preferred language from the language settings.
Configure Data Sources
Before creating synchronization jobs, configure the required data sources.
Configure a Kafka Data Source
Create a Kafka connection by providing the cluster address and connection parameters.
(Insert screenshot)
Configure an Elasticsearch Data Source
Configure your Elasticsearch cluster information, including the endpoint and authentication credentials if required.
Configure a Hive Metastore Local Data Source
You can configure the Hive Metastore endpoint using:
thrift://hive-metastore:9083
Configure Virtual Tables
Create a Virtual Table
Follow these steps to create a virtual table:
- Navigate to Virtual Tables.
- Click Create.
- Select an existing data source.
- Configure the virtual table properties.
- Click Next to define field mappings.
- Review the configuration.
- Save the virtual table.
Create Synchronization Jobs
Once the data sources and virtual tables are ready, you can build synchronization pipelines using the visual designer.
Kafka → Hive Synchronization
Configure the Job Components
- Source
Configure the Kafka source by selecting the previously created Kafka data source.
- Field Mapper
Open the Model view to define field mappings between the source and destination schemas.
- Sink
Configure Hive as the destination and specify the target database and table.
Kafka → Elasticsearch Synchronization
Configure the Job Components
- Source
Configure the Kafka source.
- Field Mapper
Configure field mappings in the Model view.
- Sink
Configure Elasticsearch as the destination.
Specify the target index and any required connection parameters.
General Workflow for Creating Synchronization Jobs
To create a synchronization job in SeaTunnel Web:
- Navigate to Jobs → Synchronization Job Definitions.
- Click Create.
- Drag or select the Source, Field Mapper, and Sink components to build the pipeline.
- Double-click the Source component and select the configured Kafka data source.
- Double-click Field Mapper, then open the Model view to configure field mappings.
- Double-click the Sink component and configure Hive or Elasticsearch as the destination.
- Save the job.
- Start the synchronization job.
Important
Before saving the job, make sure to configure the Job Mode.
Otherwise, the job cannot be saved and the following error will be displayed:
job env can't be empty, please change config
Hive Operations
Create a Table
Use one of the following commands to create a table in Hive.
# Access the HiveServer2 container
docker exec -it hive-server2 beeline -u jdbc:hive2://localhost:10000 -e "
CREATE TABLE IF NOT EXISTS default.test_user_data3 (
user_id STRING,
type STRING,
content STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
"
Alternatively, you can create the table in Parquet format, which is recommended for better storage efficiency and query performance.
# Same table definition, stored as Parquet instead of delimited text
docker exec -it hive-server2 \
  beeline -u jdbc:hive2://localhost:10000 -e "
    CREATE TABLE IF NOT EXISTS default.test_user_data3 (
      user_id STRING,
      type STRING,
      content STRING
    )
    STORED AS PARQUET;
  "
View the Table Schema
Run the following command to verify that the table has been created successfully.
# List the tables in the default database, then print the new table's columns
docker exec -it hive-server2 \
  beeline -u jdbc:hive2://localhost:10000 -e "
    SHOW TABLES IN default;
    DESCRIBE default.test_user_data3;
  "
Query Table Data
Run the following command to query the synchronized data stored in Hive.
# Show up to ten rows from the table
docker exec -it hive-server2 \
  beeline -u jdbc:hive2://localhost:10000 -e "
    SELECT * FROM default.test_user_data3 LIMIT 10;
  "
Troubleshooting
Hive Metastore URI Parsing Error
If the following exception is reported:
seatunnel seatunnel-web ERROR [qtp2135089262-20] [MetaStoreUtils.logAndThrowMetaException():166] - Got exception: java.net.URISyntaxException Illegal character in hostname at index 44: thrift://hive-metastore.seatunnel-docker_seatunnel-network:9083
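The hostname in the failing URI carries the Compose network name as a suffix (seatunnel-docker_seatunnel-network), and the underscore in it is not a legal hostname character for Java's URI parser. To make the Hive hosts resolve to their short names instead, add static hostname mappings to the corresponding services in docker-compose.yml.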
# Fragment to place under each affected service in docker-compose.yml
extra_hosts:
  - "hive-metastore:172.16.0.3"
  - "hive-metastore-db:172.16.0.2"
Hive Synchronization Fails with java.lang.NoClassDefFoundError
If a Hive synchronization job fails with java.lang.NoClassDefFoundError, ensure that the required dependency JARs are available in:
seatunnel/apache-seatunnel-2.3.11/lib
Required dependencies:
hive-exec-3.1.3.jar
hive-metastore-3.1.3.jar
libfb303-0.9.3.jar
Hive Synchronization Job Completes Successfully but No Data Is Written
If the synchronization job completes successfully but no data is written to Hive, verify that the Hive warehouse directory is mounted correctly in docker-compose.yml.
# Fragment to place under each SeaTunnel service in docker-compose.yml
volumes:
  # Mount the Hive warehouse directory to ensure data is persisted on the host
  - ./hive-warehouse:/opt/hive/data/warehouse
Check Which Worker Executes the Job
You can identify which worker node is executing the synchronization job by reviewing the master log:
./logs/master/seatunnel-engine-master.log
Example:
Task [TaskGroupLocation{jobId=1080750681855361026, pipelineId=1, taskGroupId=2}] will be executed on worker [[seatunnel-worker2]:5801], slotID [2], resourceProfile [ResourceProfile{cpu=CPU{core=0}, heapMemory=Memory{bytes=0}}], sequence [db6b679c-67cc-43b8-b64a-acaa85c2a4c0], assigned [1080750681855361026]


Top comments (0)