Chen Debra

What to Do When DolphinScheduler Fails to Recognize Symbols While Scheduling Complex HiveSQL Tasks

When scheduling complex HiveSQL tasks with Apache DolphinScheduler, the HQL may contain symbols that DolphinScheduler fails to recognize. What can you do in that situation? This article walks through a working solution you can use as a reference.

Objective

After parsing a complex JSON payload in Hive, one that nests objects and arrays, and even arrays inside arrays, I thought the job was done. Scheduling the task in DolphinScheduler, however, caused major problems that took an entire day to resolve. After trying many approaches, I finally found a fix.
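
For context, the JSON paths used in the HQL below imply a structure roughly like the hypothetical, trimmed-down sample in this sketch (field values are invented); the point is that data.sectionList is an array and each element carries its own coilList array:

-- Hypothetical sample reconstructed from the JSON paths in the HQL; values are made up.
select get_json_object(
    '{"deviceNo":"D001","sourceDeviceType":"radar","sn":"SN01","model":"M1","createTime":"2024-07-18 08:00:00","data":{"cycle":60,"sectionList":[{"sectionNo":1,"coilList":[{"coilNo":1,"laneNo":2,"volumeSum":15}]}]}}',
    '$.data.sectionList[0].coilList[0].laneNo'
);   -- returns 2 for a single, explicitly indexed element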

HiveSQL

insert  overwrite  table  hurys_dc_dwd.dwd_json_statistics partition(day)
select
        t1.device_no,
        source_device_type,
        sn,
        model,
        create_time,
        cycle,
        get_json_object(coil_list,'$.laneNo')  lane_no,
        get_json_object(coil_list,'$.laneType')           lane_type,
        section_no,
        get_json_object(coil_list,'$.coilNo')             coil_no,
        get_json_object(coil_list,'$.volumeSum')          volume_sum,
        get_json_object(coil_list,'$.volumePerson')       volume_person,
        get_json_object(coil_list,'$.volumeCarNon')       volume_car_non,
        get_json_object(coil_list,'$.volumeCarSmall')     volume_car_small,
        get_json_object(coil_list,'$.volumeCarMiddle')    volume_car_middle,
        get_json_object(coil_list,'$.volumeCarBig')       volume_car_big,
        get_json_object(coil_list,'$.speedAvg')           speed_avg,
        get_json_object(coil_list,'$.speed85')            speed_85,
        get_json_object(coil_list,'$.timeOccupancy')      time_occupancy,
        get_json_object(coil_list,'$.averageHeadway')     average_headway,
        get_json_object(coil_list,'$.averageGap')         average_gap,
        substr(create_time,1,10) day
from (select
       get_json_object(statistics_json,'$.deviceNo')          device_no,
       get_json_object(statistics_json,'$.sourceDeviceType')  source_device_type,
       get_json_object(statistics_json,'$.sn')                sn,
       get_json_object(statistics_json,'$.model')             model,
       get_json_object(statistics_json,'$.createTime')        create_time ,
       get_json_object(statistics_json,'$.data.cycle')        cycle,
       get_json_object(replace(replace(section_list,':{',':[{'),'}}','}]}'),'$.sectionNo') section_no,
       section_list
from hurys_dc_ods.ods_statistics
lateral view explode(split(replace(replace(replace(get_json_object(statistics_json,'$.data.sectionList'),
    '[',''),']',''),'},{"sectionNo"','}|{"sectionNo"'),"\\|")) tf as section_list
    where day='2024-07-18'   -- or: date_sub(current_date(), 1)
    ) as t1
lateral view explode(split(replace(replace(replace(get_json_object(replace(replace(section_list,
    ':{',':[{'),'}}','}]}'),'$.coilList'),'[',''),']',''),'},','}|'),"\\|")) tf1 as coil_list
    where substr(create_time,1,10) = '2024-07-18'   -- or: date_sub(current_date(), 1)
;
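
The pattern above is the crux of the parsing: get_json_object cannot return rows from a JSON array, so each array string is stripped of its [ ] brackets, the "},{" element boundary is rewritten to a "|"-delimited one, and the result is split and exploded into one row per element. A stripped-down sketch of that trick on a hypothetical literal:

select elem
from (select '[{"coilNo":1},{"coilNo":2}]' as arr) t
lateral view explode(
    split(replace(replace(replace(arr, '[', ''), ']', ''), '},{', '}|{'), "\\|")
) tf as elem;
-- row 1: {"coilNo":1}
-- row 2: {"coilNo":2}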

Previous DolphinScheduler Task Scheduling Method

The HiveSQL statements were embedded directly in a shell script:

#! /bin/bash
source /etc/profile

nowdate=`date --date='0 days ago' "+%Y%m%d"`
yesdate=`date -d yesterday +%Y-%m-%d`

hive -e "
use hurys_dc_dwd;

set hive.vectorized.execution.enabled=false;

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=1000;
set hive.exec.max.dynamic.partitions=1500;


with t1 as(
select
       get_json_object(statistics_json,'$.deviceNo')          device_no,
       get_json_object(statistics_json,'$.sourceDeviceType')  source_device_type,
       get_json_object(statistics_json,'$.sn')                sn,
       get_json_object(statistics_json,'$.model')             model,
       get_json_object(statistics_json,'$.createTime')        create_time ,
       get_json_object(statistics_json,'$.data.cycle')        cycle,
       get_json_object(replace(replace(section_list,':{',':[{'),'}}','}]}'),'$.sectionNo') section_no,
       section_list
from hurys_dc_ods.ods_statistics
lateral view explode(split(replace(replace(replace(get_json_object(statistics_json,'$.data.sectionList'),'[',''),']',''),'},{"sectionNo"','}|{"sectionNo"'),"\\\\|")) tf as section_list
    where day='$yesdate'
)
insert  overwrite  table  hurys_dc_dwd.dwd_json_statistics partition(day)
select
        t1.device_no,
        source_device_type,
        sn,
        model,
        substr(create_time,1,19)                          create_time ,
        cycle,
        get_json_object(coil_list,'$.laneNo')  lane_no,
        get_json_object(coil_list,'$.laneType')           lane_type,
        section_no,
        get_json_object(coil_list,'$.coilNo')             coil_no,
        get_json_object(coil_list,'$.volumeSum')          volume_sum,
        get_json_object(coil_list,'$.volumePerson')       volume_person,
        get_json_object(coil_list,'$.volumeCarNon')       volume_car_non,
        get_json_object(coil_list,'$.volumeCarSmall')     volume_car_small,
        get_json_object(coil_list,'$.volumeCarMiddle')    volume_car_middle,
        get_json_object(coil_list,'$.volumeCarBig')       volume_car_big,
        get_json_object(coil_list,'$.speedAvg')           speed_avg,
        get_json_object(coil_list,'$.speed85')            speed_85,
        get_json_object(coil_list,'$.timeOccupancy')      time_occupancy,
        get_json_object(coil_list,'$.averageHeadway')     average_headway,
        get_json_object(coil_list,'$.averageGap')         average_gap,
        substr(create_time,1,10) day
from t1
lateral view explode(split(replace(replace(replace(get_json_object(replace(replace(section_list,':{',':[{'),'}}','}]}'),'$.coilList'),'[',''),']',''),'},','}|'),"\\\\|")) tf1 as coil_list
    where  substr(create_time,1,10) ='$yesdate'
"

Error Logs Caused by the Previous Method

[Screenshot: error log from the failed DolphinScheduler task]

When the HQL parses complex JSON, DolphinScheduler does not get the symbol-heavy statement through to Hive intact; most likely the nested quotes and backslash escapes inside the inlined hive -e "..." string are mangled before Hive ever sees them.
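
One way to see the problem is to count how many interpretation layers the split() pattern must survive when the whole statement is inlined; the sketch below is illustrative only, not the production job:

# Inline version: the pattern is written as "\\\\|" inside hive -e "..."
echo "\\\\|"    # the shell collapses this to \\|  -- that is the text hive -e receives
# Hive's own string literal then collapses \\| into the regex \| that split() needs.
# A standalone .sql file removes the shell layer entirely, so the pattern can be
# written as "\\|", exactly as Hive expects it.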

Solution

Place the HiveSQL in a separate SQL file, then have the shell script execute that file via Hive's source command.

SQL File

-- Use the hurys_dc_dwd database
use hurys_dc_dwd;

-- Hive optimization (run the set statements first, otherwise some complex SQL may fail)
set hive.vectorized.execution.enabled=false;

-- Enable dynamic partitioning (default is true)
set hive.exec.dynamic.partition=true;
-- nonstrict mode allows all partition fields to use dynamic partitioning
set hive.exec.dynamic.partition.mode=nonstrict;
-- Maximum dynamic partitions that can be created per node in a single MR job
set hive.exec.max.dynamic.partitions.pernode=1000;
-- Maximum total dynamic partitions across all MR nodes (default is 1000)
set hive.exec.max.dynamic.partitions=1500;

insert  overwrite  table  hurys_dc_dwd.dwd_json_statistics partition(day)
select
        t1.device_no,
        source_device_type,
        sn,
        model,
        create_time,
        cycle,
        get_json_object(coil_list,'$.laneNo')  lane_no,
        get_json_object(coil_list,'$.laneType')           lane_type,
        section_no,
        get_json_object(coil_list,'$.coilNo')             coil_no,
        get_json_object(coil_list,'$.volumeSum')          volume_sum,
        get_json_object(coil_list,'$.volumePerson')       volume_person,
        get_json_object(coil_list,'$.volumeCarNon')       volume_car_non,
        get_json_object(coil_list,'$.volumeCarSmall')     volume_car_small,
        get_json_object(coil_list,'$.volumeCarMiddle')    volume_car_middle,
        get_json_object(coil_list,'$.volumeCarBig')       volume_car_big,
        get_json_object(coil_list,'$.speedAvg')           speed_avg,
        get_json_object(coil_list,'$.speed85')            speed_85,
        get_json_object(coil_list,'$.timeOccupancy')      time_occupancy,
        get_json_object(coil_list,'$.averageHeadway')     average_headway,
        get_json_object(coil_list,'$.averageGap')         average_gap,
        substr(create_time,1,10) day
from (select
       get_json_object(statistics_json,'$.deviceNo')          device_no,
       get_json_object(statistics_json,'$.sourceDeviceType')  source_device_type,
       get_json_object(statistics_json,'$.sn')                sn,
       get_json_object(statistics_json,'$.model')             model,
       get_json_object(statistics_json,'$.createTime')        create_time ,
       get_json_object(statistics_json,'$.data.cycle')        cycle,
       get_json_object(replace(replace(section_list,':{',':[{'),'}}','}]}'),'$.sectionNo') section_no,
       section_list
from hurys_dc_ods.ods_statistics
lateral view explode(split(replace(replace(replace(get_json_object(statistics_json,'$.data.sectionList'),'[',''),']',''),'},{"sectionNo"','}|{"sectionNo"'),"\\|")) tf as section_list
    where day= date_sub(current_date(), 1)
    ) as t1
lateral view explode(split(replace(replace(replace(get_json_object(replace(replace(section_list,':{',':[{'),'}}','}]}'),'$.coilList'),'[',''),']',''),'},','}|'),"\\|")) tf1 as coil_list
where substr(create_time,1,10) =  date_sub(current_date(), 1)
;

DolphinScheduler Task Execution Script

#! /bin/bash
source /etc/profile

nowdate=`date --date='0 days ago' "+%Y-%m-%d"`
yesdate=`date -d yesterday +%Y-%m-%d`

hive -e "
source   dwd_json_statistics.sql
" 
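
If the source command inside hive -e ever gives you trouble, an equivalent approach (assuming the .sql file sits in the task's working directory, as above) is to pass the file to the Hive CLI directly with -f, which likewise keeps the symbol-heavy SQL out of the shell string:

#!/bin/bash
source /etc/profile

# Alternative: let the Hive CLI read the SQL file itself
hive -f dwd_json_statistics.sql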

Execute the Task and Verify the Results

With that, the issue is finally resolved. If your scheduler cannot recognize symbols in your SQL, try the same approach: keep the SQL in its own file and have the scheduled script execute that file, so the statement never has to pass through shell quoting inside the scheduler.
