UniAD2.0_R101_nuScenes / config / base_track_map.py

UniAD V2.0 training config file

39bf596 verified 6 months ago

28.8 kB

	# Six-number range that is repeated verbatim as `pc_range` /
	# `point_cloud_range` throughout the pipelines and the model dict.
	# NOTE(review): looks like [x_min, y_min, z_min, x_max, y_max, z_max] — confirm.
	point_cloud_range = [
		-51.2, -51.2, -5.0,
		51.2, 51.2, 3.0,
	]

	# The ten category names, in the order used by the `classes=` /
	# `class_names=` lists of the train / val / test pipelines.
	class_names = [
		'car',
		'truck',
		'construction_vehicle',
		'bus',
		'trailer',
		'barrier',
		'motorcycle',
		'bicycle',
		'pedestrian',
		'traffic_cone',
	]

	dataset_type = 'NuScenesE2EDataset'
	data_root = 'data/nuscenes/'

	# Only the camera and "external" flags are switched on.
	input_modality = {
		'use_lidar': False,
		'use_camera': True,
		'use_radar': False,
		'use_map': False,
		'use_external': True,
	}
	file_client_args = {'backend': 'disk'}
	# Training pipeline, ten stages in this order: multi-view image loading,
	# photometric distortion, annotation loading, occupancy-flow label
	# generation, range filter, name filter, normalisation, padding,
	# formatting and final key collection.
	train_pipeline = [
		{
			'type': 'LoadMultiViewImageFromFilesInCeph',
			'to_float32': True,
			'file_client_args': {'backend': 'disk'},
			'img_root': '',
		},
		{'type': 'PhotoMetricDistortionMultiViewImage'},
		{
			'type': 'LoadAnnotations3D_E2E',
			'with_bbox_3d': True,
			'with_label_3d': True,
			'with_attr_label': False,
			'with_future_anns': True,
			'with_ins_inds_3d': True,
			'ins_inds_add_1': True,
		},
		{
			'type': 'GenerateOccFlowLabels',
			# Same three bounds as the module-level occflow_grid_conf.
			'grid_conf': {
				'xbound': [-50.0, 50.0, 0.5],
				'ybound': [-50.0, 50.0, 0.5],
				'zbound': [-10.0, 10.0, 20.0],
			},
			'ignore_index': 255,
			'only_vehicle': True,
			'filter_invisible': False,
		},
		{
			'type': 'ObjectRangeFilterTrack',
			'point_cloud_range': [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
		},
		{
			'type': 'ObjectNameFilterTrack',
			'classes': [
				'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
				'barrier', 'motorcycle', 'bicycle', 'pedestrian',
				'traffic_cone',
			],
		},
		{
			'type': 'NormalizeMultiviewImage',
			'mean': [103.53, 116.28, 123.675],
			'std': [1.0, 1.0, 1.0],
			'to_rgb': False,
		},
		{'type': 'PadMultiViewImage', 'size_divisor': 32},
		{
			'type': 'DefaultFormatBundle3D',
			'class_names': [
				'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
				'barrier', 'motorcycle', 'bicycle', 'pedestrian',
				'traffic_cone',
			],
		},
		{
			'type': 'CustomCollect3D',
			# 31 keys are collected for training.
			'keys': [
				'gt_bboxes_3d',
				'gt_labels_3d',
				'gt_inds',
				'img',
				'timestamp',
				'l2g_r_mat',
				'l2g_t',
				'gt_fut_traj',
				'gt_fut_traj_mask',
				'gt_past_traj',
				'gt_past_traj_mask',
				'gt_sdc_bbox',
				'gt_sdc_label',
				'gt_sdc_fut_traj',
				'gt_sdc_fut_traj_mask',
				'gt_lane_labels',
				'gt_lane_bboxes',
				'gt_lane_masks',
				'gt_segmentation',
				'gt_instance',
				'gt_centerness',
				'gt_offset',
				'gt_flow',
				'gt_backward_flow',
				'gt_occ_has_invalid_frame',
				'gt_occ_img_is_valid',
				'gt_future_boxes',
				'gt_future_labels',
				'sdc_planning',
				'sdc_planning_mask',
				'command',
			],
		},
	]
	# Test-time pipeline, six stages: image loading, normalisation, padding,
	# annotation loading (boxes / labels / instance ids switched off),
	# occupancy-flow label generation, and a MultiScaleFlipAug3D wrapper that
	# holds the formatting and key-collection transforms.
	test_pipeline = [
		{
			'type': 'LoadMultiViewImageFromFilesInCeph',
			'to_float32': True,
			'file_client_args': {'backend': 'disk'},
			'img_root': '',
		},
		{
			'type': 'NormalizeMultiviewImage',
			'mean': [103.53, 116.28, 123.675],
			'std': [1.0, 1.0, 1.0],
			'to_rgb': False,
		},
		{'type': 'PadMultiViewImage', 'size_divisor': 32},
		{
			'type': 'LoadAnnotations3D_E2E',
			'with_bbox_3d': False,
			'with_label_3d': False,
			'with_attr_label': False,
			'with_future_anns': True,
			'with_ins_inds_3d': False,
			'ins_inds_add_1': True,
		},
		{
			'type': 'GenerateOccFlowLabels',
			'grid_conf': {
				'xbound': [-50.0, 50.0, 0.5],
				'ybound': [-50.0, 50.0, 0.5],
				'zbound': [-10.0, 10.0, 20.0],
			},
			'ignore_index': 255,
			'only_vehicle': True,
			'filter_invisible': False,
		},
		{
			'type': 'MultiScaleFlipAug3D',
			# A single scale with flipping disabled.
			'img_scale': (1600, 900),
			'pts_scale_ratio': 1,
			'flip': False,
			'transforms': [
				{
					'type': 'DefaultFormatBundle3D',
					'class_names': [
						'car', 'truck', 'construction_vehicle', 'bus',
						'trailer', 'barrier', 'motorcycle', 'bicycle',
						'pedestrian', 'traffic_cone',
					],
					'with_label': False,
				},
				{
					'type': 'CustomCollect3D',
					# 18 keys; the per-object gt_* box / trajectory keys
					# of the training pipeline are not collected here.
					'keys': [
						'img',
						'timestamp',
						'l2g_r_mat',
						'l2g_t',
						'gt_lane_labels',
						'gt_lane_bboxes',
						'gt_lane_masks',
						'gt_segmentation',
						'gt_instance',
						'gt_centerness',
						'gt_offset',
						'gt_flow',
						'gt_backward_flow',
						'gt_occ_has_invalid_frame',
						'gt_occ_img_is_valid',
						'sdc_planning',
						'sdc_planning_mask',
						'command',
					],
				},
			],
		},
	]
	# Point-based pipeline: load a point file plus 10 sweeps, format, and
	# collect only 'points'.
	# NOTE(review): input_modality sets use_lidar=False and `evaluation.pipeline`
	# is a separate literal, so nothing visible in this config refers to
	# eval_pipeline by name — confirm it is still needed.
	eval_pipeline = [
		{
			'type': 'LoadPointsFromFile',
			'coord_type': 'LIDAR',
			'load_dim': 5,
			'use_dim': 5,
			'file_client_args': {'backend': 'disk'},
		},
		{
			'type': 'LoadPointsFromMultiSweeps',
			'sweeps_num': 10,
			'file_client_args': {'backend': 'disk'},
		},
		{
			'type': 'DefaultFormatBundle3D',
			# NOTE(review): the same ten names as the module-level
			# class_names, but listed in a different order — confirm
			# the ordering is irrelevant for this pipeline.
			'class_names': [
				'car',
				'truck',
				'trailer',
				'bus',
				'construction_vehicle',
				'bicycle',
				'motorcycle',
				'pedestrian',
				'traffic_cone',
				'barrier',
			],
			'with_label': False,
		},
		{'type': 'Collect3D', 'keys': ['points']},
	]
	# Dataloader configuration: one dataset entry per split (train / val / test)
	# followed by the two sampler types. Every pipeline below is written out in
	# full as literals; the train entry repeats the stages of the module-level
	# train_pipeline and the val / test entries repeat those of test_pipeline,
	# so editing those module-level lists does NOT change what is configured here.
	data = dict(
	samples_per_gpu=1,
	workers_per_gpu=8,
	# ---- train split: *_train.pkl info file, test_mode=False ----
	train=dict(
	type='NuScenesE2EDataset',
	data_root='data/nuscenes/',
	ann_file='data/infos/nuscenes_infos_temporal_train.pkl',
	# Ten stages, identical in content to the module-level train_pipeline.
	pipeline=[
	dict(
	type='LoadMultiViewImageFromFilesInCeph',
	to_float32=True,
	file_client_args=dict(backend='disk'),
	img_root=''),
	dict(type='PhotoMetricDistortionMultiViewImage'),
	dict(
	type='LoadAnnotations3D_E2E',
	with_bbox_3d=True,
	with_label_3d=True,
	with_attr_label=False,
	with_future_anns=True,
	with_ins_inds_3d=True,
	ins_inds_add_1=True),
	dict(
	type='GenerateOccFlowLabels',
	grid_conf=dict(
	xbound=[-50.0, 50.0, 0.5],
	ybound=[-50.0, 50.0, 0.5],
	zbound=[-10.0, 10.0, 20.0]),
	ignore_index=255,
	only_vehicle=True,
	filter_invisible=False),
	dict(
	type='ObjectRangeFilterTrack',
	point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
	dict(
	type='ObjectNameFilterTrack',
	classes=[
	'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
	'barrier', 'motorcycle', 'bicycle', 'pedestrian',
	'traffic_cone'
	]),
	dict(
	type='NormalizeMultiviewImage',
	mean=[103.53, 116.28, 123.675],
	std=[1.0, 1.0, 1.0],
	to_rgb=False),
	dict(type='PadMultiViewImage', size_divisor=32),
	dict(
	type='DefaultFormatBundle3D',
	class_names=[
	'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
	'barrier', 'motorcycle', 'bicycle', 'pedestrian',
	'traffic_cone'
	]),
	dict(
	type='CustomCollect3D',
	keys=[
	'gt_bboxes_3d', 'gt_labels_3d', 'gt_inds', 'img',
	'timestamp', 'l2g_r_mat', 'l2g_t', 'gt_fut_traj',
	'gt_fut_traj_mask', 'gt_past_traj', 'gt_past_traj_mask',
	'gt_sdc_bbox', 'gt_sdc_label', 'gt_sdc_fut_traj',
	'gt_sdc_fut_traj_mask', 'gt_lane_labels', 'gt_lane_bboxes',
	'gt_lane_masks', 'gt_segmentation', 'gt_instance',
	'gt_centerness', 'gt_offset', 'gt_flow',
	'gt_backward_flow', 'gt_occ_has_invalid_frame',
	'gt_occ_img_is_valid', 'gt_future_boxes',
	'gt_future_labels', 'sdc_planning', 'sdc_planning_mask',
	'command'
	])
	],
	classes=[
	'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
	'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
	],
	modality=dict(
	use_lidar=False,
	use_camera=True,
	use_radar=False,
	use_map=False,
	use_external=True),
	test_mode=False,
	box_type_3d='LiDAR',
	file_client_args=dict(backend='disk'),
	# use_valid_flag and queue_length are set on the train split only.
	use_valid_flag=True,
	patch_size=[102.4, 102.4],
	canvas_size=(200, 200),
	bev_size=(200, 200),
	queue_length=5,
	predict_steps=12,
	past_steps=4,
	fut_steps=4,
	use_nonlinear_optimizer=True,
	occ_receptive_field=3,
	# 6 equals the module-level occ_n_future_max / occ_n_future_plan,
	# not the module-level occ_n_future (4).
	occ_n_future=6,
	occ_filter_invalid_sample=False),
	# ---- val split: *_val.pkl info file, test_mode=True ----
	val=dict(
	type='NuScenesE2EDataset',
	data_root='data/nuscenes/',
	ann_file='data/infos/nuscenes_infos_temporal_val.pkl',
	# Six stages, identical in content to the module-level test_pipeline.
	pipeline=[
	dict(
	type='LoadMultiViewImageFromFilesInCeph',
	to_float32=True,
	file_client_args=dict(backend='disk'),
	img_root=''),
	dict(
	type='NormalizeMultiviewImage',
	mean=[103.53, 116.28, 123.675],
	std=[1.0, 1.0, 1.0],
	to_rgb=False),
	dict(type='PadMultiViewImage', size_divisor=32),
	dict(
	type='LoadAnnotations3D_E2E',
	with_bbox_3d=False,
	with_label_3d=False,
	with_attr_label=False,
	with_future_anns=True,
	with_ins_inds_3d=False,
	ins_inds_add_1=True),
	dict(
	type='GenerateOccFlowLabels',
	grid_conf=dict(
	xbound=[-50.0, 50.0, 0.5],
	ybound=[-50.0, 50.0, 0.5],
	zbound=[-10.0, 10.0, 20.0]),
	ignore_index=255,
	only_vehicle=True,
	filter_invisible=False),
	dict(
	type='MultiScaleFlipAug3D',
	img_scale=(1600, 900),
	pts_scale_ratio=1,
	flip=False,
	transforms=[
	dict(
	type='DefaultFormatBundle3D',
	class_names=[
	'car', 'truck', 'construction_vehicle', 'bus',
	'trailer', 'barrier', 'motorcycle', 'bicycle',
	'pedestrian', 'traffic_cone'
	],
	with_label=False),
	dict(
	type='CustomCollect3D',
	keys=[
	'img', 'timestamp', 'l2g_r_mat', 'l2g_t',
	'gt_lane_labels', 'gt_lane_bboxes',
	'gt_lane_masks', 'gt_segmentation', 'gt_instance',
	'gt_centerness', 'gt_offset', 'gt_flow',
	'gt_backward_flow', 'gt_occ_has_invalid_frame',
	'gt_occ_img_is_valid', 'sdc_planning',
	'sdc_planning_mask', 'command'
	])
	])
	],
	classes=[
	'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
	'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
	],
	modality=dict(
	use_lidar=False,
	use_camera=True,
	use_radar=False,
	use_map=False,
	use_external=True),
	test_mode=True,
	box_type_3d='LiDAR',
	file_client_args=dict(backend='disk'),
	patch_size=[102.4, 102.4],
	canvas_size=(200, 200),
	bev_size=(200, 200),
	predict_steps=12,
	past_steps=4,
	fut_steps=4,
	use_nonlinear_optimizer=True,
	# samples_per_gpu is repeated inside this split (same value as the
	# top-level data.samples_per_gpu).
	samples_per_gpu=1,
	eval_mod=['det', 'track', 'map'],
	occ_receptive_field=3,
	occ_n_future=6,
	occ_filter_invalid_sample=False),
	# ---- test split: same ann_file and pipeline content as val ----
	# NOTE(review): unlike val, this entry sets no occ_receptive_field,
	# occ_filter_invalid_sample or samples_per_gpu, and lists eval_mod in a
	# different order — confirm these differences are intentional.
	test=dict(
	type='NuScenesE2EDataset',
	data_root='data/nuscenes/',
	ann_file='data/infos/nuscenes_infos_temporal_val.pkl',
	pipeline=[
	dict(
	type='LoadMultiViewImageFromFilesInCeph',
	to_float32=True,
	file_client_args=dict(backend='disk'),
	img_root=''),
	dict(
	type='NormalizeMultiviewImage',
	mean=[103.53, 116.28, 123.675],
	std=[1.0, 1.0, 1.0],
	to_rgb=False),
	dict(type='PadMultiViewImage', size_divisor=32),
	dict(
	type='LoadAnnotations3D_E2E',
	with_bbox_3d=False,
	with_label_3d=False,
	with_attr_label=False,
	with_future_anns=True,
	with_ins_inds_3d=False,
	ins_inds_add_1=True),
	dict(
	type='GenerateOccFlowLabels',
	grid_conf=dict(
	xbound=[-50.0, 50.0, 0.5],
	ybound=[-50.0, 50.0, 0.5],
	zbound=[-10.0, 10.0, 20.0]),
	ignore_index=255,
	only_vehicle=True,
	filter_invisible=False),
	dict(
	type='MultiScaleFlipAug3D',
	img_scale=(1600, 900),
	pts_scale_ratio=1,
	flip=False,
	transforms=[
	dict(
	type='DefaultFormatBundle3D',
	class_names=[
	'car', 'truck', 'construction_vehicle', 'bus',
	'trailer', 'barrier', 'motorcycle', 'bicycle',
	'pedestrian', 'traffic_cone'
	],
	with_label=False),
	dict(
	type='CustomCollect3D',
	keys=[
	'img', 'timestamp', 'l2g_r_mat', 'l2g_t',
	'gt_lane_labels', 'gt_lane_bboxes',
	'gt_lane_masks', 'gt_segmentation', 'gt_instance',
	'gt_centerness', 'gt_offset', 'gt_flow',
	'gt_backward_flow', 'gt_occ_has_invalid_frame',
	'gt_occ_img_is_valid', 'sdc_planning',
	'sdc_planning_mask', 'command'
	])
	])
	],
	classes=[
	'car', 'truck', 'construction_vehicle', 'bus', 'trailer',
	'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
	],
	modality=dict(
	use_lidar=False,
	use_camera=True,
	use_radar=False,
	use_map=False,
	use_external=True),
	test_mode=True,
	box_type_3d='LiDAR',
	file_client_args=dict(backend='disk'),
	patch_size=[102.4, 102.4],
	canvas_size=(200, 200),
	bev_size=(200, 200),
	predict_steps=12,
	past_steps=4,
	fut_steps=4,
	occ_n_future=6,
	use_nonlinear_optimizer=True,
	eval_mod=['det', 'map', 'track']),
	# Sampler types, given separately under a "shuffler" and a "nonshuffler" key.
	shuffler_sampler=dict(type='DistributedGroupSampler'),
	nonshuffler_sampler=dict(type='DistributedSampler'))
	# Evaluation settings. interval=6 is the same number as total_epochs /
	# runner.max_epochs in this file. The pipeline is a literal copy of the
	# six test-time stages (it does not reference test_pipeline by name).
	evaluation = {
		'interval': 6,
		'pipeline': [
			{
				'type': 'LoadMultiViewImageFromFilesInCeph',
				'to_float32': True,
				'file_client_args': {'backend': 'disk'},
				'img_root': '',
			},
			{
				'type': 'NormalizeMultiviewImage',
				'mean': [103.53, 116.28, 123.675],
				'std': [1.0, 1.0, 1.0],
				'to_rgb': False,
			},
			{'type': 'PadMultiViewImage', 'size_divisor': 32},
			{
				'type': 'LoadAnnotations3D_E2E',
				'with_bbox_3d': False,
				'with_label_3d': False,
				'with_attr_label': False,
				'with_future_anns': True,
				'with_ins_inds_3d': False,
				'ins_inds_add_1': True,
			},
			{
				'type': 'GenerateOccFlowLabels',
				'grid_conf': {
					'xbound': [-50.0, 50.0, 0.5],
					'ybound': [-50.0, 50.0, 0.5],
					'zbound': [-10.0, 10.0, 20.0],
				},
				'ignore_index': 255,
				'only_vehicle': True,
				'filter_invisible': False,
			},
			{
				'type': 'MultiScaleFlipAug3D',
				'img_scale': (1600, 900),
				'pts_scale_ratio': 1,
				'flip': False,
				'transforms': [
					{
						'type': 'DefaultFormatBundle3D',
						'class_names': [
							'car', 'truck', 'construction_vehicle', 'bus',
							'trailer', 'barrier', 'motorcycle', 'bicycle',
							'pedestrian', 'traffic_cone',
						],
						'with_label': False,
					},
					{
						'type': 'CustomCollect3D',
						'keys': [
							'img',
							'timestamp',
							'l2g_r_mat',
							'l2g_t',
							'gt_lane_labels',
							'gt_lane_bboxes',
							'gt_lane_masks',
							'gt_segmentation',
							'gt_instance',
							'gt_centerness',
							'gt_offset',
							'gt_flow',
							'gt_backward_flow',
							'gt_occ_has_invalid_frame',
							'gt_occ_img_is_valid',
							'sdc_planning',
							'sdc_planning_mask',
							'command',
						],
					},
				],
			},
		],
		# Same string as the module-level planning_evaluation_strategy.
		'planning_evaluation_strategy': 'uniad',
	}
	# ---- Runtime settings: checkpointing, logging, distributed backend,
	# ---- output / checkpoint paths, workflow and plugin location.
	checkpoint_config = {'interval': 1}
	log_config = {
		'interval': 10,
		# Two logger hooks: plain text and TensorBoard.
		'hooks': [
			{'type': 'TextLoggerHook'},
			{'type': 'TensorboardLoggerHook'},
		],
	}
	dist_params = {'backend': 'nccl'}
	log_level = 'INFO'

	work_dir = 'projects/work_dirs/stage1_track_map/base_track_map/'
	# Weights are loaded from this checkpoint; no run is resumed.
	load_from = 'ckpts/bevformer_r101_dcn_24ep.pth'
	resume_from = None
	workflow = [
		('train', 1),
	]

	plugin = True
	plugin_dir = 'projects/mmdet3d_plugin/'
	# ---- Module-level hyper-parameters.
	# The `model` and `data` dicts in this file spell the same numbers out as
	# literals instead of referencing these names, so changing a value here
	# does not by itself change those dicts.
	voxel_size = [
		0.2, 0.2, 8,
	]
	patch_size = [
		102.4, 102.4,
	]
	img_norm_cfg = {
		'mean': [103.53, 116.28, 123.675],
		'std': [1.0, 1.0, 1.0],
		'to_rgb': False,
	}

	# Width-type constants (embedding / positional / feed-forward sizes).
	_dim_ = 256
	_pos_dim_ = 128
	_ffn_dim_ = 512
	_num_levels_ = 4
	bev_h_ = 200
	bev_w_ = 200
	_feed_dim_ = 512
	_dim_half_ = 128
	canvas_size = (200, 200)

	# Temporal / prediction step counts.
	queue_length = 5
	predict_steps = 12
	predict_modes = 6
	fut_steps = 4
	past_steps = 4
	use_nonlinear_optimizer = True

	# occ_n_future_max (6) is the larger of occ_n_future (4) and
	# occ_n_future_plan (6); the dataset entries use 6.
	occ_n_future = 4
	occ_n_future_plan = 6
	occ_n_future_max = 6

	planning_steps = 6
	use_col_optim = True
	planning_evaluation_strategy = 'uniad'

	# NOTE(review): each bound presumably reads [lower, upper, step] — confirm.
	occflow_grid_conf = {
		'xbound': [-50.0, 50.0, 0.5],
		'ybound': [-50.0, 50.0, 0.5],
		'zbound': [-10.0, 10.0, 20.0],
	}
	train_gt_iou_threshold = 0.3
	# Model configuration (type 'UniAD'): image backbone and neck, track-query
	# settings (qim_args / mem_args / loss_cfg), the pts_bbox_head, the seg_head
	# and the top-level train_cfg. All sizes are written as literals; they
	# repeat, but do not reference, the module-level variables of this file.
	model = dict(
	type='UniAD',
	# Same value as the module-level train_gt_iou_threshold.
	gt_iou_threshold=0.3,
	queue_length=5,
	use_grid_mask=True,
	video_test_mode=True,
	# 900 queries / 10 classes; the same numbers are repeated in loss_cfg,
	# pts_bbox_head and its bbox_coder below.
	num_query=900,
	num_classes=10,
	pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
	# ---- image backbone: ResNet depth 101, caffe style ----
	# frozen_stages equals num_stages (4), norm layers have requires_grad=False
	# and norm_eval=True; freeze_img_backbone=True is also set further down.
	img_backbone=dict(
	type='ResNet',
	depth=101,
	num_stages=4,
	out_indices=(1, 2, 3),
	frozen_stages=4,
	norm_cfg=dict(type='BN2d', requires_grad=False),
	norm_eval=True,
	style='caffe',
	# DCNv2 is switched on for the last two of the four stages only.
	dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
	stage_with_dcn=(False, False, True, True)),
	# ---- image neck: FPN ----
	# Three in_channels entries for the three backbone out_indices; four
	# outputs (num_outs=4), the same count as num_levels=4 used by the
	# multi-level attention configs below.
	img_neck=dict(
	type='FPN',
	in_channels=[512, 1024, 2048],
	out_channels=256,
	start_level=0,
	add_extra_convs='on_output',
	num_outs=4,
	relu_before_extra_convs=True),
	# Backbone frozen, neck not frozen.
	freeze_img_backbone=True,
	freeze_img_neck=False,
	freeze_bn=False,
	# Two score thresholds; filter_score_thresh is the lower of the two.
	score_thresh=0.4,
	filter_score_thresh=0.35,
	# ---- track-query handling ----
	qim_args=dict(
	qim_type='QIMBase',
	merger_dropout=0,
	update_query_pos=True,
	fp_ratio=0.3,
	random_drop=0.1),
	mem_args=dict(
	memory_bank_type='MemoryBank',
	memory_bank_score_thresh=0.0,
	memory_bank_len=4),
	# ---- tracking loss (ClipMatcher) ----
	loss_cfg=dict(
	type='ClipMatcher',
	num_classes=10,
	weight_dict=None,
	# Ten weights; the last two are 0.2, the rest 1.0.
	code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
	# Assigner cost weights (2.0 / 0.25) equal the loss weights of
	# loss_cls / loss_bbox just below.
	assigner=dict(
	type='HungarianAssigner3DTrack',
	cls_cost=dict(type='FocalLossCost', weight=2.0),
	reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
	pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]),
	loss_cls=dict(
	type='FocalLoss',
	use_sigmoid=True,
	gamma=2.0,
	alpha=0.25,
	loss_weight=2.0),
	loss_bbox=dict(type='L1Loss', loss_weight=0.25),
	# Weight is 0.0 — presumably disables the past-trajectory term; confirm.
	loss_past_traj_weight=0.0),
	# ---- pts_bbox_head (BEVFormerTrackHead) ----
	# bev_h / bev_w = 200 equal the module-level bev_h_ / bev_w_ and the
	# row / column embedding counts of positional_encoding below.
	pts_bbox_head=dict(
	type='BEVFormerTrackHead',
	bev_h=200,
	bev_w=200,
	num_query=900,
	num_classes=10,
	in_channels=256,
	sync_cls_avg_factor=True,
	with_box_refine=True,
	as_two_stage=False,
	past_steps=4,
	fut_steps=4,
	transformer=dict(
	type='PerceptionTransformer',
	rotate_prev_bev=True,
	use_shift=True,
	use_can_bus=True,
	embed_dims=256,
	# Encoder: 6 layers, each self_attn -> norm -> cross_attn -> norm ->
	# ffn -> norm (see operation_order), with TemporalSelfAttention and
	# SpatialCrossAttention as the two attention configs.
	encoder=dict(
	type='BEVFormerEncoder',
	num_layers=6,
	pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
	num_points_in_pillar=4,
	return_intermediate=False,
	transformerlayers=dict(
	type='BEVFormerLayer',
	attn_cfgs=[
	dict(
	type='TemporalSelfAttention',
	embed_dims=256,
	num_levels=1),
	dict(
	type='SpatialCrossAttention',
	pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
	deformable_attention=dict(
	type='MSDeformableAttention3D',
	embed_dims=256,
	num_points=8,
	num_levels=4),
	embed_dims=256)
	],
	feedforward_channels=512,
	ffn_dropout=0.1,
	operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
	'ffn', 'norm'))),
	# Decoder: 6 layers with the same operation order; intermediate
	# outputs are returned (return_intermediate=True).
	decoder=dict(
	type='DetectionTransformerDecoder',
	num_layers=6,
	return_intermediate=True,
	transformerlayers=dict(
	type='DetrTransformerDecoderLayer',
	attn_cfgs=[
	dict(
	type='MultiheadAttention',
	embed_dims=256,
	num_heads=8,
	dropout=0.1),
	dict(
	type='CustomMSDeformableAttention',
	embed_dims=256,
	num_levels=1)
	],
	feedforward_channels=512,
	ffn_dropout=0.1,
	operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
	'ffn', 'norm')))),
	# post_center_range is wider than pc_range: entries 1, 2, 4 and 5 are
	# 10 larger in magnitude, and the third / sixth entries are -10 / 10
	# instead of -5 / 3.
	bbox_coder=dict(
	type='NMSFreeCoder',
	post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
	pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
	max_num=300,
	voxel_size=[0.2, 0.2, 8],
	num_classes=10),
	# num_feats=128 is half of embed_dims=256.
	positional_encoding=dict(
	type='LearnedPositionalEncoding',
	num_feats=128,
	row_num_embed=200,
	col_num_embed=200),
	loss_cls=dict(
	type='FocalLoss',
	use_sigmoid=True,
	gamma=2.0,
	alpha=0.25,
	loss_weight=2.0),
	loss_bbox=dict(type='L1Loss', loss_weight=0.25),
	# GIoU loss is configured with weight 0.0.
	loss_iou=dict(type='GIoULoss', loss_weight=0.0)),
	# ---- seg_head (PansegformerHead) ----
	# num_classes (4) = num_things_classes (3) + num_stuff_classes (1).
	# NOTE(review): in_channels=2048 here while every attention module in
	# this head uses embed_dims=256 — confirm which value the head consumes.
	seg_head=dict(
	type='PansegformerHead',
	bev_h=200,
	bev_w=200,
	canvas_size=(200, 200),
	pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
	num_query=300,
	num_classes=4,
	num_things_classes=3,
	num_stuff_classes=1,
	in_channels=2048,
	sync_cls_avg_factor=True,
	as_two_stage=False,
	with_box_refine=True,
	transformer=dict(
	type='SegDeformableTransformer',
	# Encoder layers have no cross-attention step in operation_order.
	encoder=dict(
	type='DetrTransformerEncoder',
	num_layers=6,
	transformerlayers=dict(
	type='BaseTransformerLayer',
	attn_cfgs=dict(
	type='MultiScaleDeformableAttention',
	embed_dims=256,
	num_levels=4),
	feedforward_channels=512,
	ffn_dropout=0.1,
	operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
	decoder=dict(
	type='DeformableDetrTransformerDecoder',
	num_layers=6,
	return_intermediate=True,
	transformerlayers=dict(
	type='DetrTransformerDecoderLayer',
	attn_cfgs=[
	dict(
	type='MultiheadAttention',
	embed_dims=256,
	num_heads=8,
	dropout=0.1),
	dict(
	type='MultiScaleDeformableAttention',
	embed_dims=256,
	num_levels=4)
	],
	feedforward_channels=512,
	ffn_dropout=0.1,
	operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
	'ffn', 'norm')))),
	positional_encoding=dict(
	type='SinePositionalEncoding',
	num_feats=128,
	normalize=True,
	offset=-0.5),
	# Loss weights: cls 2.0, bbox 5.0, iou 2.0, mask 2.0 — the assigner
	# costs in train_cfg below use the same four numbers.
	loss_cls=dict(
	type='FocalLoss',
	use_sigmoid=True,
	gamma=2.0,
	alpha=0.25,
	loss_weight=2.0),
	loss_bbox=dict(type='L1Loss', loss_weight=5.0),
	loss_iou=dict(type='GIoULoss', loss_weight=2.0),
	loss_mask=dict(type='DiceLoss', loss_weight=2.0),
	# Two SegMaskHead configs: the "thing" head has 4 decoder layers, the
	# "stuff" head has 6 and additionally sets self_attn=True.
	thing_transformer_head=dict(
	type='SegMaskHead', d_model=256, nhead=8, num_decoder_layers=4),
	stuff_transformer_head=dict(
	type='SegMaskHead',
	d_model=256,
	nhead=8,
	num_decoder_layers=6,
	self_attn=True),
	# Two assigner / sampler pairs; the "_with_mask" assigner has the same
	# three costs as the plain one plus a DiceCost mask_cost.
	train_cfg=dict(
	assigner=dict(
	type='HungarianAssigner',
	cls_cost=dict(type='FocalLossCost', weight=2.0),
	reg_cost=dict(
	type='BBoxL1Cost', weight=5.0, box_format='xywh'),
	iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0)),
	assigner_with_mask=dict(
	type='HungarianAssigner_multi_info',
	cls_cost=dict(type='FocalLossCost', weight=2.0),
	reg_cost=dict(
	type='BBoxL1Cost', weight=5.0, box_format='xywh'),
	iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0),
	mask_cost=dict(type='DiceCost', weight=2.0)),
	sampler=dict(type='PseudoSampler'),
	sampler_with_mask=dict(type='PseudoSampler_segformer'))),
	# ---- top-level train_cfg ----
	# grid_size 512 is consistent with the 102.4-wide range divided by the
	# 0.2 voxel size; iou_cost is configured with weight 0.0.
	train_cfg=dict(
	pts=dict(
	grid_size=[512, 512, 1],
	voxel_size=[0.2, 0.2, 8],
	point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
	out_size_factor=4,
	assigner=dict(
	type='HungarianAssigner3D',
	cls_cost=dict(type='FocalLossCost', weight=2.0),
	reg_cost=dict(type='BBox3DL1Cost', weight=0.25),
	iou_cost=dict(type='IoUCost', weight=0.0),
	pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]))))
	# ---- Annotation info files. Val and test point at the same *_val.pkl.
	info_root = 'data/infos/'
	ann_file_train = 'data/infos/nuscenes_infos_temporal_train.pkl'
	ann_file_val = 'data/infos/nuscenes_infos_temporal_val.pkl'
	ann_file_test = 'data/infos/nuscenes_infos_temporal_val.pkl'

	# ---- Optimiser: AdamW, with a 0.1 learning-rate multiplier registered
	# ---- under the 'img_backbone' custom key.
	optimizer = {
		'type': 'AdamW',
		'lr': 2e-4,
		'paramwise_cfg': {
			'custom_keys': {
				'img_backbone': {'lr_mult': 0.1},
			},
		},
		'weight_decay': 0.01,
	}
	optimizer_config = {
		'grad_clip': {'max_norm': 35, 'norm_type': 2},
	}

	# ---- Learning-rate schedule: cosine annealing with a linear warm-up.
	lr_config = {
		'policy': 'CosineAnnealing',
		'warmup': 'linear',
		'warmup_iters': 500,
		# 1.0 / 3 evaluates to the float 0.3333333333333333.
		'warmup_ratio': 1.0 / 3,
		'min_lr_ratio': 1e-3,
	}

	# ---- Run length: six epochs with the epoch-based runner.
	total_epochs = 6
	runner = {'type': 'EpochBasedRunner', 'max_epochs': 6}

	find_unused_parameters = True
	logger_name = 'mmdet'
	# range(1) compares equal to range(0, 1): it contains only GPU id 0.
	gpu_ids = range(1)