# Dataset-level settings. The same literal values are repeated inside the
# pipeline, dataset and model entries further down this config.

# Six-element spatial range; presumably
# [x_min, y_min, z_min, x_max, y_max, z_max] -- TODO confirm with the
# transforms/coders that consume it.
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]

# Ordered list of the ten category names used by the pipelines and datasets.
class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]

# Dataset class name and data root directory.
dataset_type = 'NuScenesE2EDataset'
data_root = 'data/nuscenes/'

# Input switches: only the camera and "external" flags are enabled.
input_modality = dict(
    use_lidar=False,
    use_camera=True,
    use_radar=False,
    use_map=False,
    use_external=True)

# File-access backend configuration (local disk).
file_client_args = dict(backend='disk')
| train_pipeline = [ |
| dict( |
| type='LoadMultiViewImageFromFilesInCeph', |
| to_float32=True, |
| file_client_args=dict(backend='disk'), |
| img_root=''), |
| dict(type='PhotoMetricDistortionMultiViewImage'), |
| dict( |
| type='LoadAnnotations3D_E2E', |
| with_bbox_3d=True, |
| with_label_3d=True, |
| with_attr_label=False, |
| with_future_anns=True, |
| with_ins_inds_3d=True, |
| ins_inds_add_1=True), |
| dict( |
| type='GenerateOccFlowLabels', |
| grid_conf=dict( |
| xbound=[-50.0, 50.0, 0.5], |
| ybound=[-50.0, 50.0, 0.5], |
| zbound=[-10.0, 10.0, 20.0]), |
| ignore_index=255, |
| only_vehicle=True, |
| filter_invisible=False), |
| dict( |
| type='ObjectRangeFilterTrack', |
| point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]), |
| dict( |
| type='ObjectNameFilterTrack', |
| classes=[ |
| 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', |
| 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' |
| ]), |
| dict( |
| type='NormalizeMultiviewImage', |
| mean=[103.53, 116.28, 123.675], |
| std=[1.0, 1.0, 1.0], |
| to_rgb=False), |
| dict(type='PadMultiViewImage', size_divisor=32), |
| dict( |
| type='DefaultFormatBundle3D', |
| class_names=[ |
| 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', |
| 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' |
| ]), |
| dict( |
| type='CustomCollect3D', |
| keys=[ |
| 'gt_bboxes_3d', 'gt_labels_3d', 'gt_inds', 'img', 'timestamp', |
| 'l2g_r_mat', 'l2g_t', 'gt_fut_traj', 'gt_fut_traj_mask', |
| 'gt_past_traj', 'gt_past_traj_mask', 'gt_sdc_bbox', 'gt_sdc_label', |
| 'gt_sdc_fut_traj', 'gt_sdc_fut_traj_mask', 'gt_lane_labels', |
| 'gt_lane_bboxes', 'gt_lane_masks', 'gt_segmentation', |
| 'gt_instance', 'gt_centerness', 'gt_offset', 'gt_flow', |
| 'gt_backward_flow', 'gt_occ_has_invalid_frame', |
| 'gt_occ_img_is_valid', 'gt_future_boxes', 'gt_future_labels', |
| 'sdc_planning', 'sdc_planning_mask', 'command' |
| ]) |
| ] |
# Test-time data pipeline: an ordered list of transform configs. Each dict's
# `type` names a transform class defined outside this file; the other keys
# are its kwargs. Box/label/instance-index loading is switched off here,
# while future annotations are still requested.
test_pipeline = [
    dict(
        type='LoadMultiViewImageFromFilesInCeph',
        to_float32=True,
        file_client_args=dict(backend='disk'),
        img_root=''),
    dict(
        type='NormalizeMultiviewImage',
        mean=[103.53, 116.28, 123.675],
        std=[1.0, 1.0, 1.0],
        to_rgb=False),
    dict(type='PadMultiViewImage', size_divisor=32),
    dict(
        type='LoadAnnotations3D_E2E',
        with_bbox_3d=False,
        with_label_3d=False,
        with_attr_label=False,
        with_future_anns=True,
        with_ins_inds_3d=False,
        ins_inds_add_1=True),
    dict(
        type='GenerateOccFlowLabels',
        grid_conf=dict(
            xbound=[-50.0, 50.0, 0.5],
            ybound=[-50.0, 50.0, 0.5],
            zbound=[-10.0, 10.0, 20.0]),
        ignore_index=255,
        only_vehicle=True,
        filter_invisible=False),
    # Wrapper step holding nested transforms; configured with one image
    # scale and flipping disabled.
    dict(
        type='MultiScaleFlipAug3D',
        img_scale=(1600, 900),
        pts_scale_ratio=1,
        flip=False,
        transforms=[
            dict(
                type='DefaultFormatBundle3D',
                class_names=[
                    'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
                    'barrier', 'motorcycle', 'bicycle', 'pedestrian',
                    'traffic_cone'
                ],
                with_label=False),
            dict(
                type='CustomCollect3D',
                keys=[
                    'img', 'timestamp', 'l2g_r_mat', 'l2g_t', 'gt_lane_labels',
                    'gt_lane_bboxes', 'gt_lane_masks', 'gt_segmentation',
                    'gt_instance', 'gt_centerness', 'gt_offset', 'gt_flow',
                    'gt_backward_flow', 'gt_occ_has_invalid_frame',
                    'gt_occ_img_is_valid', 'sdc_planning', 'sdc_planning_mask',
                    'command'
                ])
        ])
]
# Point-based pipeline: an ordered list of transform configs that ends by
# collecting only the 'points' key.
# NOTE(review): the class_names ordering below differs from the module-level
# `class_names` list; with_label=False is set here, so confirm whether the
# ordering has any effect.
eval_pipeline = [
    dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=5,
        file_client_args=dict(backend='disk')),
    dict(
        type='LoadPointsFromMultiSweeps',
        sweeps_num=10,
        file_client_args=dict(backend='disk')),
    dict(
        type='DefaultFormatBundle3D',
        class_names=[
            'car', 'truck', 'trailer', 'bus', 'construction_vehicle',
            'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
        ],
        with_label=False),
    dict(type='Collect3D', keys=['points'])
]
# Dataset/dataloader configuration with train, val and test entries.
#
# The dataset type, data root, pipelines, class list, modality dict and
# file-client args reference the module-level names defined earlier in this
# file (`dataset_type`, `data_root`, `train_pipeline`, `test_pipeline`,
# `class_names`, `input_modality`, `file_client_args`) rather than repeating
# verbatim copies of them, so the values cannot drift apart.
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=8,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='data/infos/nuscenes_infos_temporal_train.pkl',
        pipeline=train_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=False,
        box_type_3d='LiDAR',
        file_client_args=file_client_args,
        use_valid_flag=True,
        patch_size=[102.4, 102.4],
        canvas_size=(200, 200),
        bev_size=(200, 200),
        queue_length=5,
        predict_steps=12,
        past_steps=4,
        fut_steps=4,
        use_nonlinear_optimizer=True,
        occ_receptive_field=3,
        occ_n_future=6,
        occ_filter_invalid_sample=False),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='data/infos/nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR',
        file_client_args=file_client_args,
        patch_size=[102.4, 102.4],
        canvas_size=(200, 200),
        bev_size=(200, 200),
        predict_steps=12,
        past_steps=4,
        fut_steps=4,
        use_nonlinear_optimizer=True,
        samples_per_gpu=1,
        eval_mod=['det', 'track', 'map'],
        occ_receptive_field=3,
        occ_n_future=6,
        occ_filter_invalid_sample=False),
    # The test entry reads the same annotation file as `val`.
    # NOTE(review): unlike `val`, this entry sets no `samples_per_gpu`,
    # `occ_receptive_field` or `occ_filter_invalid_sample` -- confirm that
    # relying on the dataset defaults is intended.
    test=dict(
        type=dataset_type,
        data_root=data_root,
        ann_file='data/infos/nuscenes_infos_temporal_val.pkl',
        pipeline=test_pipeline,
        classes=class_names,
        modality=input_modality,
        test_mode=True,
        box_type_3d='LiDAR',
        file_client_args=file_client_args,
        patch_size=[102.4, 102.4],
        canvas_size=(200, 200),
        bev_size=(200, 200),
        predict_steps=12,
        past_steps=4,
        fut_steps=4,
        occ_n_future=6,
        use_nonlinear_optimizer=True,
        eval_mod=['det', 'map', 'track']),
    # Sampler configs; judging by the key names, one for shuffled and one for
    # non-shuffled loading -- confirm against the dataloader builder.
    shuffler_sampler=dict(type='DistributedGroupSampler'),
    nonshuffler_sampler=dict(type='DistributedSampler'))
# Evaluation settings. `pipeline` references the module-level `test_pipeline`
# defined earlier in this file instead of repeating a verbatim copy of it.
# NOTE(review): `interval` is presumably counted in epochs -- confirm against
# the evaluation hook.
evaluation = dict(
    interval=6,
    pipeline=test_pipeline,
    planning_evaluation_strategy='uniad')
# Runtime settings: checkpointing, logging, distributed backend, paths.
checkpoint_config = dict(interval=1)
log_config = dict(
    interval=10,
    hooks=[dict(type='TextLoggerHook'),
           dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = 'projects/work_dirs/stage1_track_map/base_track_map/'
# Checkpoint path to initialise from; no run is being resumed.
load_from = 'ckpts/bevformer_r101_dcn_24ep.pth'
resume_from = None
workflow = [('train', 1)]

# Plugin switch and the directory holding the project plugin code.
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

# Geometry and image-normalisation constants; the same literal values are
# repeated inside the pipeline, dataset and model entries of this config.
voxel_size = [0.2, 0.2, 8]
patch_size = [102.4, 102.4]
img_norm_cfg = dict(
    mean=[103.53, 116.28, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)

# Scalar hyper-parameters. Their values match literals that appear inside
# the `model` dict (e.g. embed_dims=256, feedforward_channels=512,
# num_feats=128, bev_h=200, bev_w=200).
_dim_ = 256
_pos_dim_ = 128
_ffn_dim_ = 512
_num_levels_ = 4
bev_h_ = 200
bev_w_ = 200
_feed_dim_ = 512
_dim_half_ = 128
canvas_size = (200, 200)

# Temporal / prediction settings.
queue_length = 5
predict_steps = 12
predict_modes = 6
fut_steps = 4
past_steps = 4
use_nonlinear_optimizer = True

# NOTE(review): `occ_n_future` is 4 here, while the dataset entries in `data`
# set occ_n_future=6, which equals `occ_n_future_max` -- confirm the datasets
# are meant to follow the max value.
occ_n_future = 4
occ_n_future_plan = 6
occ_n_future_max = 6

planning_steps = 6
use_col_optim = True
planning_evaluation_strategy = 'uniad'

# Grid bounds; identical to the `grid_conf` given to the
# 'GenerateOccFlowLabels' pipeline steps in this config.
occflow_grid_conf = dict(
    xbound=[-50.0, 50.0, 0.5],
    ybound=[-50.0, 50.0, 0.5],
    zbound=[-10.0, 10.0, 20.0])
train_gt_iou_threshold = 0.3
# Model configuration. Every nested dict with a `type` key names a component
# class defined outside this file; the remaining keys are its constructor
# kwargs. Top-level sections: image backbone and neck, query-interaction and
# memory-bank args, the tracking loss, the BEV tracking head, the
# segmentation head, and a model-level train_cfg.
model = dict(
    type='UniAD',
    gt_iou_threshold=0.3,
    queue_length=5,
    use_grid_mask=True,
    video_test_mode=True,
    num_query=900,
    num_classes=10,
    pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
    img_backbone=dict(
        type='ResNet',
        depth=101,
        num_stages=4,
        out_indices=(1, 2, 3),
        frozen_stages=4,
        norm_cfg=dict(type='BN2d', requires_grad=False),
        norm_eval=True,
        style='caffe',
        dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
        stage_with_dcn=(False, False, True, True)),
    img_neck=dict(
        type='FPN',
        in_channels=[512, 1024, 2048],
        out_channels=256,
        start_level=0,
        add_extra_convs='on_output',
        num_outs=4,
        relu_before_extra_convs=True),
    # Freeze switches: backbone frozen, neck and BN not frozen.
    freeze_img_backbone=True,
    freeze_img_neck=False,
    freeze_bn=False,
    score_thresh=0.4,
    filter_score_thresh=0.35,
    qim_args=dict(
        qim_type='QIMBase',
        merger_dropout=0,
        update_query_pos=True,
        fp_ratio=0.3,
        random_drop=0.1),
    mem_args=dict(
        memory_bank_type='MemoryBank',
        memory_bank_score_thresh=0.0,
        memory_bank_len=4),
    loss_cfg=dict(
        type='ClipMatcher',
        num_classes=10,
        weight_dict=None,
        code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
        assigner=dict(
            type='HungarianAssigner3DTrack',
            cls_cost=dict(type='FocalLossCost', weight=2.0),
            reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
            pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        loss_past_traj_weight=0.0),
    pts_bbox_head=dict(
        type='BEVFormerTrackHead',
        bev_h=200,
        bev_w=200,
        num_query=900,
        num_classes=10,
        in_channels=256,
        sync_cls_avg_factor=True,
        with_box_refine=True,
        as_two_stage=False,
        past_steps=4,
        fut_steps=4,
        transformer=dict(
            type='PerceptionTransformer',
            rotate_prev_bev=True,
            use_shift=True,
            use_can_bus=True,
            embed_dims=256,
            encoder=dict(
                type='BEVFormerEncoder',
                num_layers=6,
                pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
                num_points_in_pillar=4,
                return_intermediate=False,
                transformerlayers=dict(
                    type='BEVFormerLayer',
                    attn_cfgs=[
                        dict(
                            type='TemporalSelfAttention',
                            embed_dims=256,
                            num_levels=1),
                        dict(
                            type='SpatialCrossAttention',
                            pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
                            deformable_attention=dict(
                                type='MSDeformableAttention3D',
                                embed_dims=256,
                                num_points=8,
                                num_levels=4),
                            embed_dims=256)
                    ],
                    feedforward_channels=512,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm'))),
            decoder=dict(
                type='DetectionTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='CustomMSDeformableAttention',
                            embed_dims=256,
                            num_levels=1)
                    ],
                    feedforward_channels=512,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        bbox_coder=dict(
            type='NMSFreeCoder',
            post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
            pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
            max_num=300,
            voxel_size=[0.2, 0.2, 8],
            num_classes=10),
        positional_encoding=dict(
            type='LearnedPositionalEncoding',
            num_feats=128,
            row_num_embed=200,
            col_num_embed=200),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='L1Loss', loss_weight=0.25),
        # The IoU loss is configured with a weight of 0.0.
        loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
    seg_head=dict(
        type='PansegformerHead',
        bev_h=200,
        bev_w=200,
        canvas_size=(200, 200),
        pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
        num_query=300,
        # num_classes (4) equals num_things_classes (3) + num_stuff_classes (1).
        num_classes=4,
        num_things_classes=3,
        num_stuff_classes=1,
        in_channels=2048,
        sync_cls_avg_factor=True,
        as_two_stage=False,
        with_box_refine=True,
        transformer=dict(
            type='SegDeformableTransformer',
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        num_levels=4),
                    feedforward_channels=512,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DeformableDetrTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.1),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=256,
                            num_levels=4)
                    ],
                    feedforward_channels=512,
                    ffn_dropout=0.1,
                    operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
                                     'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            normalize=True,
            offset=-0.5),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=2.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0),
        loss_mask=dict(type='DiceLoss', loss_weight=2.0),
        thing_transformer_head=dict(
            type='SegMaskHead', d_model=256, nhead=8, num_decoder_layers=4),
        stuff_transformer_head=dict(
            type='SegMaskHead',
            d_model=256,
            nhead=8,
            num_decoder_layers=6,
            self_attn=True),
        # Assigner cost weights below match the loss weights above
        # (cls 2.0, bbox 5.0, iou 2.0, mask 2.0).
        train_cfg=dict(
            assigner=dict(
                type='HungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(
                    type='BBoxL1Cost', weight=5.0, box_format='xywh'),
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
            assigner_with_mask=dict(
                type='HungarianAssigner_multi_info',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(
                    type='BBoxL1Cost', weight=5.0, box_format='xywh'),
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
                mask_cost=dict(type='DiceCost', weight=2.0)),
            sampler=dict(type='PseudoSampler'),
            sampler_with_mask=dict(type='PseudoSampler_segformer'))),
    train_cfg=dict(
        pts=dict(
            grid_size=[512, 512, 1],
            voxel_size=[0.2, 0.2, 8],
            point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
            out_size_factor=4,
            assigner=dict(
                type='HungarianAssigner3D',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
                iou_cost=dict(type='IoUCost', weight=0.0),
                pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]))))
# Annotation-info paths. The test path is identical to the val path.
info_root = 'data/infos/'
ann_file_train = 'data/infos/nuscenes_infos_temporal_train.pkl'
ann_file_val = 'data/infos/nuscenes_infos_temporal_val.pkl'
ann_file_test = 'data/infos/nuscenes_infos_temporal_val.pkl'

# Optimizer: AdamW, with an lr multiplier of 0.1 for parameters matched by the
# 'img_backbone' custom key.
optimizer = dict(
    type='AdamW',
    lr=0.0002,
    paramwise_cfg=dict(custom_keys=dict(img_backbone=dict(lr_mult=0.1))),
    weight_decay=0.01)
# Gradient clipping by L2 norm with a maximum norm of 35.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# Learning-rate schedule: cosine annealing with a linear warm-up.
lr_config = dict(
    policy='CosineAnnealing',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.3333333333333333,
    min_lr_ratio=0.001)

# Training length; `total_epochs` and the runner's `max_epochs` are both 6.
total_epochs = 6
runner = dict(type='EpochBasedRunner', max_epochs=6)
find_unused_parameters = True
logger_name = 'mmdet'
gpu_ids = range(0, 1)
|