Analysis of the rl_training code structure and reward functions for the DeepRobotics Lite3 (based on IsaacLab)

Git repository
The code is built on IsaacLab's manager-based RL framework and implements velocity-command reinforcement learning for the Lite3 and the M20 (a wheeled quadruped). The overall file layout closely follows IsaacLab.

├── logs.txt                    // terminal output from runs
├── docs                        // images used by the README
├── logs                        // training checkpoints, videos, etc.
├── outputs                     // hydra logs and parameter YAML files
│   └── 2025-09-03
├── scripts
│   ├── reinforcement_learning  // uses the standard rsl_rl library
│   │   ├── rl_utils.py
│   │   └── rsl_rl
│   │       ├── train.py        // parses arguments, then @hydra_task_config resolves task and agent into agent_cfg and env_cfg
│   │       ├── play.py
│   │       └── cli_args.py     // command-line argument handling
│   └── tools                   // standalone utility scripts
│       ├── check_robot.py
│       ├── clean_trash.py
│       ├── convert_mjcf.py
│       ├── convert_urdf.py
│       └── list_envs.py
└── source                      // main code
    └── rl_training
        ├── config              // IsaacLab extension configuration
        ├── data                // model/asset files
        ├── pyproject.toml      // pip build-system configuration
        ├── rl_training         // main RL code
        └── setup.py            // pip package installation

Expanding rl_training further, the main files are as follows.

rl_training
├── assets
│   ├── deeprobotics.py   // ArticulationCfg for the wheeled and legged robots
│   └── utils
│       └── usd_converter.py
├── tasks
│   ├── manager_based     // IsaacLab manager-based tasks
│   │   ├── locomotion    // locomotion learning
│   │   │   └── velocity  // velocity-command locomotion
│   │   │       ├── config                // env cfgs
│   │   │       │   ├── quadruped         // env cfgs for the quadruped
│   │   │       │   │   ├── deeprobotics_lite3
│   │   │       │   │   │   ├── __init__.py       // gym registry entries mapping the task name given on the command line to its configuration
│   │   │       │   │   │   ├── agents
│   │   │       │   │   │   │   └── rsl_rl_ppo_cfg.py  // RL agent parameters
│   │   │       │   │   │   ├── flat_env_cfg.py   // flat-terrain env cfg, derived from rough
│   │   │       │   │   │   └── rough_env_cfg.py  // main env cfg, inherits velocity_env_cfg
│   │   │       │   └── wheeled             // env cfgs for the wheeled robot, analogous to quadruped
│   │   │       │       ├── deeprobotics_m20
│   │   │       │       │   ├── agents
│   │   │       │       │   │   └── rsl_rl_ppo_cfg.py
│   │   │       │       │   ├── flat_env_cfg.py
│   │   │       │       │   └── rough_env_cfg.py
│   │   │       ├── mdp                     // functions used by the MDP manager modules
│   │   │       │   ├── commands.py
│   │   │       │   ├── curriculums.py
│   │   │       │   ├── events.py
│   │   │       │   ├── observations.py
│   │   │       │   └── **rewards.py**      // reward functions for the basic behaviors
│   │   │       └── **velocity_env_cfg.py** // main env cfg file, configures the manager modules

In IsaacLab's manager-based RL setup, most of the configuration lives in the env cfg, which covers the basic scene, observation, action, reward, termination, and event modules, plus extras such as command and curriculum.
LocomotionVelocityRoughEnvCfg in velocity_env_cfg.py is the main env cfg; both flat_env_cfg and rough_env_cfg inherit from it. It contains MySceneCfg, ActionCfg, ObservationCfg, CommandsCfg, RewardsCfg, TerminationsCfg, EventCfg, and CurriculumCfg.
The most important part of the env cfg is the reward, so it is worth going through every reward term in rewards.py.
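As a rough sketch of how a term from rewards.py gets wired into RewardsCfg: the import paths below assume a recent IsaacLab release (package named isaaclab), the mdp import path is inferred from the tree above, and the weights and params are illustrative rather than copied from the repo's actual config.

from isaaclab.managers import RewardTermCfg as RewTerm
from isaaclab.managers import SceneEntityCfg
from isaaclab.utils import configclass

# the task-local mdp package described in the tree above (exact path may differ)
from rl_training.tasks.manager_based.locomotion.velocity import mdp


@configclass
class RewardsCfg:
    """Illustrative reward manager config; weights and params are placeholders."""

    # task reward: track the commanded xy linear velocity with an exponential kernel
    track_lin_vel_xy_exp = RewTerm(
        func=mdp.track_lin_vel_xy_exp,
        weight=3.0,  # the #Flat weight noted below
        params={"command_name": "base_velocity", "std": 0.25},
    )
    # regularization: penalize mechanical power at the joints
    joint_power = RewTerm(
        func=mdp.joint_power,
        weight=-2e-5,
        params={"asset_cfg": SceneEntityCfg("robot", joint_names=".*")},
    )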

Judging from the README's description of play, the training objective is to make the robot dog track commanded velocities: linear velocity in any direction within its own xy plane, plus rotation about the z axis (yaw), with real-time acceleration and deceleration.
All reward terms are listed below; terms used in Flat training are marked with #Flat (followed by their weight), and likewise for Rough.

track_lin_vel_xy_exp #Flat=3

def track_lin_vel_xy_exp(
    env: ManagerBasedRLEnv, std: float, command_name: str, asset_cfg: SceneEntityCfg = SceneEntityCfg("robot")
) -> torch.Tensor:
    """Reward tracking of linear velocity commands (xy axes) using exponential kernel."""
    # extract the used quantities (to enable type-hinting)
    asset: RigidObject = env.scene[asset_cfg.name]
    # compute the error
    lin_vel_error = torch.sum(
        torch.square(env.command_manager.get_command(command_name)[:, :2] - asset.data.root_lin_vel_b[:, :2]),
        dim=1,
    )
    reward = torch.exp(-lin_vel_error / std**2)
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

The core quantity is the squared error between the commanded xy velocity and the robot's current xy velocity (in the body frame), mapped through an exponential kernel into a nonlinear, monotonically decreasing reward: when the error is 0 the reward is 1, and the larger the error, the closer the reward gets to 0. std acts as a sharpness parameter: at the same error level, a smaller std yields a lower reward.
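A minimal standalone check of the kernel's shape (plain PyTorch, with arbitrarily chosen error values):

import torch

# squared velocity-tracking errors (m/s)^2 for three hypothetical envs
lin_vel_error = torch.tensor([0.0, 0.25, 1.0])

for std in (0.25, 0.5):
    reward = torch.exp(-lin_vel_error / std**2)
    print(f"std={std}: {reward.tolist()}")

# std=0.25: [1.0, 0.018..., ~0.0]    -> sharp kernel, only near-perfect tracking is rewarded
# std=0.5 : [1.0, 0.367..., 0.018...] -> softer kernel, partial tracking still earns reward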

track_ang_vel_z_exp #Flat=1.5

def track_ang_vel_z_exp(
    env: ManagerBasedRLEnv, std: float, command_name: str, asset_cfg: SceneEntityCfg = SceneEntityCfg("robot")
) -> torch.Tensor:
    """Reward tracking of angular velocity commands (yaw) using exponential kernel."""
    # extract the used quantities (to enable type-hinting)
    asset: RigidObject = env.scene[asset_cfg.name]
    # compute the error
    ang_vel_error = torch.square(env.command_manager.get_command(command_name)[:, 2] - asset.data.root_ang_vel_b[:, 2])
    reward = torch.exp(-ang_vel_error / std**2)
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

This reward is essentially the same as the previous one, except that it tracks angular velocity about z instead of linear velocity in xy.

track_lin_vel_xy_yaw_frame_exp

def track_lin_vel_xy_yaw_frame_exp(
    env, std: float, command_name: str, asset_cfg: SceneEntityCfg = SceneEntityCfg("robot")
) -> torch.Tensor:
    """Reward tracking of linear velocity commands (xy axes) in the gravity aligned robot frame using exponential kernel."""
    # extract the used quantities (to enable type-hinting)
    asset = env.scene[asset_cfg.name]
    vel_yaw = quat_apply_inverse(yaw_quat(asset.data.root_quat_w), asset.data.root_lin_vel_w[:, :3])
    lin_vel_error = torch.sum(
        torch.square(env.command_manager.get_command(command_name)[:, :2] - vel_yaw[:, :2]), dim=1
    )
    reward = torch.exp(-lin_vel_error / std**2)
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

This reward accounts for the fact that on sloped terrain the robot's own xy-plane velocity does not match the velocity in the true horizontal plane. yaw_quat() extracts the yaw-only quaternion from the robot's orientation, which defines a gravity-aligned "yaw frame", and quat_apply_inverse() then rotates the robot's world-frame velocity into that yaw frame. (For reference, quat_apply() rotates from the local frame to the world frame.)
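The yaw-frame transform itself amounts to a 2D rotation of the world-frame velocity by the negative yaw angle; a plain-PyTorch sketch (independent of the isaaclab math utilities, with made-up numbers):

import math
import torch

# world-frame planar velocity and the robot's yaw angle (illustrative values)
vel_w = torch.tensor([1.0, 0.0])      # moving along world +x at 1 m/s
yaw = math.radians(90.0)              # robot heading rotated 90 deg about z

# rotate the world velocity into the yaw (heading- and gravity-aligned) frame:
# equivalent to quat_apply_inverse(yaw_quat(q), v) restricted to the xy plane
rot = torch.tensor([
    [math.cos(yaw), math.sin(yaw)],
    [-math.sin(yaw), math.cos(yaw)],
])
vel_yaw = rot @ vel_w
print(vel_yaw)  # ~[0, -1]: in its own heading frame the robot is moving toward -y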

track_ang_vel_z_world_exp

def track_ang_vel_z_world_exp(
    env, command_name: str, std: float, asset_cfg: SceneEntityCfg = SceneEntityCfg("robot")
) -> torch.Tensor:
    """Reward tracking of angular velocity commands (yaw) in world frame using exponential kernel."""
    # extract the used quantities (to enable type-hinting)
    asset = env.scene[asset_cfg.name]
    ang_vel_error = torch.square(env.command_manager.get_command(command_name)[:, 2] - asset.data.root_ang_vel_w[:, 2])
    reward = torch.exp(-ang_vel_error / std**2)
    reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

This reward computes the yaw angular-velocity error in the world frame, which removes the influence of terrain slope and suits scenarios that need global heading control. Note that here the uprightness factor (the projected-gravity clamp) is active rather than commented out, so the reward fades to zero as the robot tips over.

joint_power #Flat=-2e-05

def joint_power(env: ManagerBasedRLEnv, asset_cfg: SceneEntityCfg = SceneEntityCfg("robot")) -> torch.Tensor:
    """Reward joint_power"""
    # extract the used quantities (to enable type-hinting)
    asset: Articulation = env.scene[asset_cfg.name]
    # compute the reward
    reward = torch.sum(
        torch.abs(asset.data.joint_vel[:, asset_cfg.joint_ids] * asset.data.applied_torque[:, asset_cfg.joint_ids]),
        dim=1,
    )
    return reward

This term sums the instantaneous power (|joint velocity × applied torque|) over all joints specified by asset_cfg.joint_ids; with a negative weight it penalizes high instantaneous power.
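In numbers (invented joint velocities and torques for a single env):

import torch

joint_vel = torch.tensor([[2.0, -1.0, 0.5]])         # rad/s
applied_torque = torch.tensor([[10.0, 4.0, 8.0]])    # N*m

power = torch.sum(torch.abs(joint_vel * applied_torque), dim=1)
print(power)  # tensor([28.]) W; with weight -2e-05 this contributes about -5.6e-4 per step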

stand_still_without_cmd #Flat=-2

def stand_still_without_cmd(
    env: ManagerBasedRLEnv,
    command_name: str,
    command_threshold: float,
    asset_cfg: SceneEntityCfg = SceneEntityCfg("robot"),
) -> torch.Tensor:
    """Penalize joint positions that deviate from the default one when no command."""
    # extract the used quantities (to enable type-hinting)
    asset: Articulation = env.scene[asset_cfg.name]
    # compute out of limits constraints
    diff_angle = asset.data.joint_pos[:, asset_cfg.joint_ids] - asset.data.default_joint_pos[:, asset_cfg.joint_ids]
    reward = torch.sum(torch.abs(diff_angle), dim=1)
    reward *= torch.linalg.norm(env.command_manager.get_command(command_name), dim=1) < command_threshold  # command norm: larger means a more definite velocity command; the penalty only applies below the threshold
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

If there is an active velocity command this term is 0; otherwise it equals the sum of absolute errors between all joint positions and their default (standing) positions. Its purpose is to keep the robot steady and reduce jitter when no command is given.
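This multiply-by-a-boolean-mask gating shows up in many of the terms below (reward *= norm(cmd) < threshold, or > threshold); a tiny illustration with made-up numbers:

import torch

# per-env penalty values and commanded (vx, vy, wz) for three hypothetical envs
penalty = torch.tensor([0.8, 0.5, 1.2])
commands = torch.tensor([[0.0, 0.0, 0.0],    # idle
                         [1.0, 0.0, 0.0],    # walk forward
                         [0.0, 0.0, 0.05]])  # negligible yaw command
command_threshold = 0.1

# the boolean mask is cast to 0/1 by the multiplication, so the penalty
# survives only for envs whose command norm is below the threshold
gated = penalty * (torch.linalg.norm(commands, dim=1) < command_threshold)
print(gated)  # tensor([0.8000, 0.0000, 1.2000])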

joint_pos_penalty #Flat=-0.1

def joint_pos_penalty(
    env: ManagerBasedRLEnv,
    command_name: str,
    asset_cfg: SceneEntityCfg,
    stand_still_scale: float,
    velocity_threshold: float,
    command_threshold: float,
) -> torch.Tensor:
    """Penalize joint position error from default on the articulation."""
    # extract the used quantities (to enable type-hinting)
    asset: Articulation = env.scene[asset_cfg.name]
    cmd = torch.linalg.norm(env.command_manager.get_command(command_name), dim=1)  # command norm: larger means a more definite velocity command
    body_vel = torch.linalg.norm(asset.data.root_lin_vel_b[:, :2], dim=1)  # robot speed in the xy plane
    running_reward = torch.linalg.norm(
        (asset.data.joint_pos[:, asset_cfg.joint_ids] - asset.data.default_joint_pos[:, asset_cfg.joint_ids]), dim=1
    )  # how far the joints deviate from their default positions
    reward = torch.where(
        torch.logical_or(cmd > command_threshold, body_vel > velocity_threshold),  # commanded, or actually moving
        running_reward,                      # -> use the base penalty
        stand_still_scale * running_reward,  # -> idle and stationary: scale the penalty up
    )
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

This term penalizes joint deviation from the default pose with different strengths depending on whether a command is active and whether the robot is moving: while commanded or moving the penalty is relaxed (preserving flexibility), while idle and stationary it is amplified (preserving stability). See the comments for details.
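A quick illustration of the two-branch penalty (arbitrary numbers; stand_still_scale amplifies the penalty only for idle, stationary envs):

import torch

running_reward = torch.tensor([0.3, 0.3, 0.3])   # same joint deviation in three envs
cmd = torch.tensor([1.0, 0.0, 0.0])              # command norm
body_vel = torch.tensor([0.8, 0.4, 0.0])         # planar speed
command_threshold, velocity_threshold = 0.1, 0.2
stand_still_scale = 5.0

reward = torch.where(
    torch.logical_or(cmd > command_threshold, body_vel > velocity_threshold),
    running_reward,                      # commanded or moving: base penalty
    stand_still_scale * running_reward,  # idle and stationary: amplified penalty
)
print(reward)  # tensor([0.3000, 0.3000, 1.5000])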

wheel_vel_penalty

GaitReward

joint_mirror #Flat=-0.05

def joint_mirror(env: ManagerBasedRLEnv, asset_cfg: SceneEntityCfg, mirror_joints: list[list[str]]) -> torch.Tensor:
    # extract the used quantities (to enable type-hinting)
    asset: Articulation = env.scene[asset_cfg.name]
    # create the cache on first use to avoid repeated joint look-ups
    if not hasattr(env, "joint_mirror_joints_cache") or env.joint_mirror_joints_cache is None: 
        # Cache joint positions for all pairs
        env.joint_mirror_joints_cache = [
            [asset.find_joints(joint_name) for joint_name in joint_pair] for joint_pair in mirror_joints
        ]
    reward = torch.zeros(env.num_envs, device=env.device)
    # Iterate over all joint pairs, accumulating each mirrored pair's squared symmetry error
    for joint_pair in env.joint_mirror_joints_cache:
        # Calculate the difference for each pair and add to the total reward
        diff = torch.sum(
            torch.square(asset.data.joint_pos[:, joint_pair[0][0]] - asset.data.joint_pos[:, joint_pair[1][0]]),
            dim=-1,
        )
        reward += diff
    # normalize by the number of mirror pairs if there are any; otherwise zero out the penalty
    reward *= 1 / len(mirror_joints) if len(mirror_joints) > 0 else 0
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

This term penalizes position differences between the robot's mirrored joint pairs, encouraging mirror-symmetric joint motion. See the comments for details.
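The core computation on a dummy joint-position tensor (joint indices and values are made up; in the real config mirror_joints holds pairs of joint-name patterns, resolved through asset.find_joints()):

import torch

# dummy joint positions for 2 envs x 4 joints; pretend joints (0, 2) and (1, 3)
# are mirrored pairs (e.g. left vs right hip/thigh)
joint_pos = torch.tensor([[0.1, -0.5, 0.1, -0.5],
                          [0.1, -0.5, 0.3, -0.2]])
mirror_pairs = [([0], [2]), ([1], [3])]  # stand-ins for asset.find_joints() results

reward = torch.zeros(joint_pos.shape[0])
for left_ids, right_ids in mirror_pairs:
    reward += torch.sum(torch.square(joint_pos[:, left_ids] - joint_pos[:, right_ids]), dim=-1)
reward /= len(mirror_pairs)
print(reward)  # tensor([0.0000, 0.0650]): env 1 breaks symmetry and gets penalized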

action_mirror

action_sync

feet_air_time_positive_biped

def feet_air_time_positive_biped(env, command_name: str, threshold: float, sensor_cfg: SceneEntityCfg) -> torch.Tensor:
    """Reward long steps taken by the feet for bipeds.

    This function rewards the agent for taking steps up to a specified threshold and also keep one foot at
    a time in the air.

    If the commands are small (i.e. the agent is not supposed to take a step), then the reward is zero.
    """
    contact_sensor: ContactSensor = env.scene.sensors[sensor_cfg.name]
    # compute the reward
    air_time = contact_sensor.data.current_air_time[:, sensor_cfg.body_ids]
    contact_time = contact_sensor.data.current_contact_time[:, sensor_cfg.body_ids]
    in_contact = contact_time > 0.0
    in_mode_time = torch.where(in_contact, contact_time, air_time)
    single_stance = torch.sum(in_contact.int(), dim=1) == 1
    reward = torch.min(torch.where(single_stance.unsqueeze(-1), in_mode_time, 0.0), dim=1)[0]
    reward = torch.clamp(reward, max=threshold)
    # no reward for zero command
    reward *= torch.norm(env.command_manager.get_command(command_name)[:, :2], dim=1) > 0.1
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

Rewards a biped for taking longer steps (sustained single-stance phases): during single stance it takes the minimum of the feet's current air/contact durations, clamps it at the threshold, and zeroes it when the command is near zero.

feet_air_time_variance_penalty

Penalizes the variance of the feet's air time.

feet_contact

def feet_contact(
    env: ManagerBasedRLEnv, command_name: str, expect_contact_num: int, sensor_cfg: SceneEntityCfg
) -> torch.Tensor:
    """Reward feet contact"""
    # extract the used quantities (to enable type-hinting)
    contact_sensor: ContactSensor = env.scene.sensors[sensor_cfg.name]
    # compute the reward
    contact = contact_sensor.compute_first_contact(env.step_dt)[:, sensor_cfg.body_ids]
    contact_num = torch.sum(contact, dim=1)
    reward = (contact_num != expect_contact_num).float()  # 1 when the contact count differs from the expected number
    # no reward for zero command
    reward *= torch.linalg.norm(env.command_manager.get_command(command_name), dim=1) > 0.5
    # zero out the penalty when the robot has fallen over (the factor scales with uprightness)
    reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

Penalizes cases where the number of feet making contact differs from the expected count (see the comments); mainly used for gait shaping and posture stability.

feet_contact_without_cmd #Flat=0.1

def feet_contact_without_cmd(env: ManagerBasedRLEnv, command_name: str, sensor_cfg: SceneEntityCfg) -> torch.Tensor:
    """Reward feet contact"""
    # extract the used quantities (to enable type-hinting)
    contact_sensor: ContactSensor = env.scene.sensors[sensor_cfg.name]
    # compute the reward
    contact = contact_sensor.compute_first_contact(env.step_dt)[:, sensor_cfg.body_ids]
    # print(contact, "contact")
    reward = torch.sum(contact, dim=-1).float()
    # print(reward, "reward after sum")
    reward *= torch.linalg.norm(env.command_manager.get_command(command_name), dim=1) < 0.5
    # print(env.command_manager.get_command(command_name), "env.command_manager.get_command(command_name)")
    # print(reward, "reward after multiply")
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

When there is no command, rewards the number of feet in contact with the ground.

feet_stumble #Flat=-0.1

def feet_stumble(env: ManagerBasedRLEnv, sensor_cfg: SceneEntityCfg) -> torch.Tensor:
    # extract the used quantities (to enable type-hinting)
    contact_sensor: ContactSensor = env.scene.sensors[sensor_cfg.name]
    forces_z = torch.abs(contact_sensor.data.net_forces_w[:, sensor_cfg.body_ids, 2])
    forces_xy = torch.linalg.norm(contact_sensor.data.net_forces_w[:, sensor_cfg.body_ids, :2], dim=2)
    # Penalize feet hitting vertical surfaces
    reward = torch.any(forces_xy > 4 * forces_z, dim=1).float()
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

Penalizes stumbling, triggered when the horizontal contact force on any foot exceeds 4 times the vertical force (i.e., the foot is hitting a vertical surface).
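The trigger condition on a dummy force tensor (numbers invented):

import torch

# net contact forces for 1 env x 2 feet, (fx, fy, fz) in newtons (made-up values)
forces = torch.tensor([[[ 2.0, 0.0, 30.0],    # normal stance: mostly vertical load
                        [25.0, 0.0,  3.0]]])  # foot pushing against a step edge
forces_z = torch.abs(forces[..., 2])
forces_xy = torch.linalg.norm(forces[..., :2], dim=-1)

stumbled = torch.any(forces_xy > 4 * forces_z, dim=1).float()
print(stumbled)  # tensor([1.]): the second foot's lateral force dominates, so the env is penalized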

feet_distance_y_exp

def feet_distance_y_exp(
    env: ManagerBasedRLEnv, stance_width: float, std: float, asset_cfg: SceneEntityCfg = SceneEntityCfg("robot")
) -> torch.Tensor:
    asset: RigidObject = env.scene[asset_cfg.name]
    # translate foot positions to be relative to the base (still in world axes)
    cur_footsteps_translated = asset.data.body_link_pos_w[:, asset_cfg.body_ids, :] - asset.data.root_link_pos_w[
        :, :
    ].unsqueeze(1)
    # rotate into the body frame using the base orientation, giving per-foot body-frame coordinates
    n_feet = len(asset_cfg.body_ids)
    footsteps_in_body_frame = torch.zeros(env.num_envs, n_feet, 3, device=env.device)
    for i in range(n_feet):
        footsteps_in_body_frame[:, i, :] = math_utils.quat_apply(
            math_utils.quat_conjugate(asset.data.root_link_quat_w), cur_footsteps_translated[:, i, :]
        )
    side_sign = torch.tensor(
        [1.0 if i % 2 == 0 else -1.0 for i in range(n_feet)],
        device=env.device,
    )
    # desired lateral (y) offset of each foot: half the stance width, signed by side
    stance_width_tensor = stance_width * torch.ones([env.num_envs, 1], device=env.device)
    desired_ys = stance_width_tensor / 2 * side_sign.unsqueeze(0)
    # squared deviation from the desired y position
    stance_diff = torch.square(desired_ys - footsteps_in_body_frame[:, :, 1])
    # sum over feet and apply the exponential kernel
    reward = torch.exp(-torch.sum(stance_diff, dim=1) / (std**2))
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

Rewards how closely the feet's lateral (y) positions in the body frame match the expected stance_width.
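For example, with stance_width = 0.3 and the feet ordered so that even indices are left feet (as the side_sign pattern implies), the per-foot targets and the resulting reward work out as below (a toy check with made-up foot positions and std):

import torch

stance_width = 0.3
n_feet = 4
# +1 for left feet (even indices), -1 for right feet (odd indices), as in side_sign
side_sign = torch.tensor([1.0 if i % 2 == 0 else -1.0 for i in range(n_feet)])
desired_ys = stance_width / 2 * side_sign
print(desired_ys)  # tensor([ 0.1500, -0.1500,  0.1500, -0.1500])

# body-frame y positions of the feet (made-up): one foot stands slightly too narrow
feet_y = torch.tensor([0.15, -0.10, 0.15, -0.15])
stance_diff = torch.square(desired_ys - feet_y)
std = 0.1
reward = torch.exp(-stance_diff.sum() / std**2)
print(reward)  # ~0.78: a small narrowing of one foot already reduces the reward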

feet_distance_xy_exp

Similar to the above, but it also includes the position deviation along the x axis.

feet_height

def feet_height(
    env: ManagerBasedRLEnv,
    command_name: str,
    asset_cfg: SceneEntityCfg,
    target_height: float,
    tanh_mult: float,
) -> torch.Tensor:
    """Reward the swinging feet for clearing a specified height off the ground"""
    asset: RigidObject = env.scene[asset_cfg.name]
    foot_z_target_error = torch.square(asset.data.body_pos_w[:, asset_cfg.body_ids, 2] - target_height)
    # foot_velocity_tanh = torch.tanh(
    #     tanh_mult * torch.linalg.norm(asset.data.body_lin_vel_w[:, asset_cfg.body_ids, :2], dim=2)
    # )
    # reward = torch.sum(foot_z_target_error * foot_velocity_tanh, dim=1)
    reward = torch.sum(foot_z_target_error, dim=1)
    # print(foot_z_target_error, "foot_z_target_error")
    # no reward for zero command
    reward *= torch.linalg.norm(env.command_manager.get_command(command_name), dim=1) > 0.2
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

Penalizes deviation of the (world-frame) foot height from the target height.

feet_height_body #Flat=-5.0

def feet_height_body(
    env: ManagerBasedRLEnv,
    command_name: str,
    asset_cfg: SceneEntityCfg,
    target_height: float,
    tanh_mult: float,
) -> torch.Tensor:
    """Reward the swinging feet for clearing a specified height off the ground"""
    asset: RigidObject = env.scene[asset_cfg.name]
    cur_footpos_translated = asset.data.body_pos_w[:, asset_cfg.body_ids, :] - asset.data.root_pos_w[:, :].unsqueeze(1)
    footpos_in_body_frame = torch.zeros(env.num_envs, len(asset_cfg.body_ids), 3, device=env.device)
    cur_footvel_translated = asset.data.body_lin_vel_w[:, asset_cfg.body_ids, :] - asset.data.root_lin_vel_w[
        :, :
    ].unsqueeze(1)
    footvel_in_body_frame = torch.zeros(env.num_envs, len(asset_cfg.body_ids), 3, device=env.device)
    for i in range(len(asset_cfg.body_ids)):
        # foot position in the body frame (its z component is the foot height)
        footpos_in_body_frame[:, i, :] = math_utils.quat_apply_inverse(
            asset.data.root_quat_w, cur_footpos_translated[:, i, :]
        )
        # foot velocity in the body frame (its xy components are used below)
        footvel_in_body_frame[:, i, :] = math_utils.quat_apply_inverse(
            asset.data.root_quat_w, cur_footvel_translated[:, i, :]
        )
    # squared deviation of the body-frame foot height from the target
    foot_z_target_error = torch.square(footpos_in_body_frame[:, :, 2] - target_height).view(env.num_envs, -1)
    # speed-dependent weight: the faster the foot swings, the larger the weight
    foot_velocity_tanh = torch.tanh(tanh_mult * torch.norm(footvel_in_body_frame[:, :, :2], dim=2))
    reward = torch.sum(foot_z_target_error * foot_velocity_tanh, dim=1)
    reward *= torch.linalg.norm(env.command_manager.get_command(command_name), dim=1) > 0.5
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

Similar to the previous term, but it measures foot height in the body frame and adds a velocity-dependent weight: the faster a foot swings, the more its height deviation is penalized, which helps prevent foot scuffing and collisions.
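How the tanh factor gates the height error by foot speed (made-up numbers; tanh_mult controls how quickly the weight saturates):

import torch

tanh_mult = 2.0
foot_xy_speed = torch.tensor([0.0, 0.2, 1.0])           # m/s, body frame
foot_z_target_error = torch.tensor([0.01, 0.01, 0.01])  # same height error, for comparison

weight = torch.tanh(tanh_mult * foot_xy_speed)
print(weight)                        # tensor([0.0000, 0.3799, 0.9640])
print(foot_z_target_error * weight)  # stationary feet are ignored, fast-swinging feet are penalized most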

feet_slide #Flat=-0.1

def feet_slide(
    env: ManagerBasedRLEnv, sensor_cfg: SceneEntityCfg, asset_cfg: SceneEntityCfg = SceneEntityCfg("robot")
) -> torch.Tensor:
    """Penalize feet sliding.

    This function penalizes the agent for sliding its feet on the ground. The reward is computed as the
    norm of the linear velocity of the feet multiplied by a binary contact sensor. This ensures that the
    agent is penalized only when the feet are in contact with the ground.
    """
    # Penalize feet sliding
    contact_sensor: ContactSensor = env.scene.sensors[sensor_cfg.name]
    contacts = contact_sensor.data.net_forces_w_history[:, :, sensor_cfg.body_ids, :].norm(dim=-1).max(dim=1)[0] > 1.0  # a contact force above 1 N counts as contact
    asset: RigidObject = env.scene[asset_cfg.name]

    # feet_vel = asset.data.body_lin_vel_w[:, asset_cfg.body_ids, :2]
    # reward = torch.sum(feet_vel.norm(dim=-1) * contacts, dim=1)
    # step 1: foot velocities relative to the root, in world axes
    cur_footvel_translated = asset.data.body_lin_vel_w[:, asset_cfg.body_ids, :] - asset.data.root_lin_vel_w[
        :, :
    ].unsqueeze(1)
    # step 2: allocate storage for the body-frame foot velocities
    footvel_in_body_frame = torch.zeros(env.num_envs, len(asset_cfg.body_ids), 3, device=env.device)
    # step 3: rotate each foot's relative velocity from the world frame into the body frame
    for i in range(len(asset_cfg.body_ids)):
        footvel_in_body_frame[:, i, :] = math_utils.quat_apply_inverse(
            asset.data.root_quat_w, cur_footvel_translated[:, i, :]
        )
    # lateral (xy) slip speed of each foot in the body frame
    foot_leteral_vel = torch.sqrt(torch.sum(torch.square(footvel_in_body_frame[:, :, :2]), dim=2)).view(
        env.num_envs, -1
    )
    reward = torch.sum(foot_leteral_vel * contacts, dim=1)
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

Penalizes feet sliding while in contact with the ground.

upward #Flat=0.15

def upward(env: ManagerBasedRLEnv, asset_cfg: SceneEntityCfg = SceneEntityCfg("robot")) -> torch.Tensor:
    """Penalize z-axis base linear velocity using L2 squared kernel."""
    # extract the used quantities (to enable type-hinting)
    asset: RigidObject = env.scene[asset_cfg.name]
    # projected_gravity_b[:, 2] measures alignment of the body z axis with gravity: -1 when fully upright, approaching +1 when completely flipped over
    reward = torch.square(1 - asset.data.projected_gravity_b[:, 2])
    return reward

Rewards the alignment of the body z axis with gravity (i.e. staying upright).
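Numerically, with the gravity direction projected into the body frame (the z component is -1 when upright, approaching +1 when flipped over):

import torch

projected_gravity_z = torch.tensor([-1.0, -0.7, 0.0, 1.0])  # upright ... tilted ... on its side ... upside down
reward = torch.square(1 - projected_gravity_z)
print(reward)  # tensor([4.0000, 2.8900, 1.0000, 0.0000]): maximal when upright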

base_height_l2 #Flat=-10

def base_height_l2(
    env: ManagerBasedRLEnv,
    target_height: float,
    asset_cfg: SceneEntityCfg = SceneEntityCfg("robot"),
    sensor_cfg: SceneEntityCfg | None = None,
) -> torch.Tensor:
    """Penalize asset height from its target using L2 squared kernel.

    Note:
        For flat terrain, target height is in the world frame. For rough terrain,
        sensor readings can adjust the target height to account for the terrain.
    """
    # extract the used quantities (to enable type-hinting)
    asset: RigidObject = env.scene[asset_cfg.name]
    if sensor_cfg is not None:
        sensor: RayCaster = env.scene[sensor_cfg.name]
        # Adjust the target height using the sensor data
        ray_hits = sensor.data.ray_hits_w[..., 2]
        if torch.isnan(ray_hits).any() or torch.isinf(ray_hits).any() or torch.max(torch.abs(ray_hits)) > 1e6:
            adjusted_target_height = asset.data.root_link_pos_w[:, 2]
        else:
            adjusted_target_height = target_height + torch.mean(ray_hits, dim=1)
    else:
        # Use the provided target height directly for flat terrain
        adjusted_target_height = target_height
    # Compute the L2 squared penalty
    reward = torch.square(asset.data.root_pos_w[:, 2] - adjusted_target_height)
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

Penalizes deviation of the base height from the target height.

lin_vel_z_l2 #Flat=-2

def lin_vel_z_l2(env: ManagerBasedRLEnv, asset_cfg: SceneEntityCfg = SceneEntityCfg("robot")) -> torch.Tensor:
    """Penalize z-axis base linear velocity using L2 squared kernel."""
    # extract the used quantities (to enable type-hinting)
    asset: RigidObject = env.scene[asset_cfg.name]
    reward = torch.square(asset.data.root_lin_vel_b[:, 2])
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

Penalizes vertical (z-axis) motion of the base.

ang_vel_xy_l2 #Flat=-0.05

def ang_vel_xy_l2(env: ManagerBasedRLEnv, asset_cfg: SceneEntityCfg = SceneEntityCfg("robot")) -> torch.Tensor:
    """Penalize xy-axis base angular velocity using L2 squared kernel."""
    # extract the used quantities (to enable type-hinting)
    asset: RigidObject = env.scene[asset_cfg.name]
    reward = torch.sum(torch.square(asset.data.root_ang_vel_b[:, :2]), dim=1)
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

Penalizes base angular velocity about the x and y axes (roll and pitch rates).

undesired_contacts

def undesired_contacts(env: ManagerBasedRLEnv, threshold: float, sensor_cfg: SceneEntityCfg) -> torch.Tensor:
    """Penalize undesired contacts as the number of violations that are above a threshold."""
    # extract the used quantities (to enable type-hinting)
    contact_sensor: ContactSensor = env.scene.sensors[sensor_cfg.name]
    # check if contact force is above threshold
    net_contact_forces = contact_sensor.data.net_forces_w_history
    is_contact = torch.max(torch.norm(net_contact_forces[:, :, sensor_cfg.body_ids], dim=-1), dim=1)[0] > threshold
    # sum over contacts for each environment
    reward = torch.sum(is_contact, dim=1).float()
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward
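Counts, per environment, how many of the specified bodies have a contact force above the threshold; with a negative weight this penalizes contacts on body parts that should not touch anything (typically links other than the feet).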

flat_orientation_l2

def flat_orientation_l2(env: ManagerBasedRLEnv, asset_cfg: SceneEntityCfg = SceneEntityCfg("robot")) -> torch.Tensor:
    """Penalize non-flat base orientation using L2 squared kernel.

    This is computed by penalizing the xy-components of the projected gravity vector.
    """
    # extract the used quantities (to enable type-hinting)
    asset: RigidObject = env.scene[asset_cfg.name]
    reward = torch.sum(torch.square(asset.data.projected_gravity_b[:, :2]), dim=1)
    # reward *= torch.clamp(-env.scene["robot"].data.projected_gravity_b[:, 2], 0, 0.7) / 0.7
    return reward

Penalizes a non-flat base orientation, using the xy components of the projected gravity vector to measure how level the base is.


All of the reward terms above are defined in rewards.py; a few more are taken directly from isaaclab.envs.mdp:

joint_torques_l2  # -2.5e-05
joint_acc_l2      # -1e-07
joint_pos_limits  # -5.0
action_rate_l2    # -0.01
contact_forces    # -0.00015