Binary file modified envs/__pycache__/simple_biped_env.cpython-312.pyc
Binary file added envs/__pycache__/simple_biped_env.cpython-313.pyc
52 changes: 33 additions & 19 deletions envs/simple_biped_env.py
@@ -9,20 +9,50 @@ def __init__(self, render=False, max_steps=10000):
        super(simpleBipedEnv, self).__init__()
        self.render_mode = render
        self.physicsClient = p.connect(p.GUI if render else p.DIRECT)
        self.robot_id = p.loadURDF("walkers/simple_biped.urdf", [0, 0, 1.0])
        p.setAdditionalSearchPath(pybullet_data.getDataPath())
        self.robot_id = p.loadURDF("walkers/simple_biped.urdf", [0, 0, 1.0])
        self.plane_id = p.loadURDF("plane.urdf")

        # Find foot link indices (adjust names to match your URDF!)
        self.left_foot = 1   # replace with actual index
        self.right_foot = 3  # replace with actual index

        num_joints = 4  # left_hip, right_hip, left_knee, right_knee
        num_joints = 4
        self.action_space = spaces.Box(low=-1, high=1, shape=(num_joints,), dtype=np.float32)
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2*num_joints + 6,), dtype=np.float32)
        self.max_steps = max_steps
        self.current_step = 0
        self.last_action = None
        self.off_ground_counter = 0  # track consecutive steps with no contacts

    def _feet_in_contact(self):
        left_contacts = p.getContactPoints(self.robot_id, self.plane_id, linkIndexA=self.left_foot)
        right_contacts = p.getContactPoints(self.robot_id, self.plane_id, linkIndexA=self.right_foot)
        return len(left_contacts) > 0, len(right_contacts) > 0

    def _compute_reward(self):
        base_pos, _ = p.getBasePositionAndOrientation(self.robot_id)
        forward_reward = base_pos[0]
        alive_bonus = 1 if not self._check_termination() else -500

        # --- Penalize if both feet off ground for too long ---
        left_contact, right_contact = self._feet_in_contact()
        if not left_contact and not right_contact:
            self.off_ground_counter += 1
        else:
            self.off_ground_counter = 0

        airborne_penalty = -10 if self.off_ground_counter > 500 else 0  # tune threshold & penalty

        return forward_reward + alive_bonus + airborne_penalty

    def reset(self, seed=None, options=None):
        p.resetBasePositionAndOrientation(self.robot_id, [0, 0, 1], [0, 0, 0, 1])
        num_joints = p.getNumJoints(self.robot_id)
        for j in range(num_joints):
            p.resetJointState(self.robot_id, j, targetValue=0.0, targetVelocity=0.0)
        p.setGravity(0, 0, -9.8)
        plane_id = p.loadURDF("plane.urdf")
        base_pos = p.getBasePositionAndOrientation(self.robot_id)[0]
@@ -45,7 +75,7 @@ def step(self, action):
        current_positions = np.array([state[0] for state in joint_states], dtype=np.float32)

        # Apply relative change to joint angles
        delta = action * 0.05  # scale step size as needed
        delta = action * 0.2   # scale step size as needed
        target_positions = current_positions + delta

        p.setJointMotorControlArray(
@@ -78,22 +108,6 @@ def _check_termination(self):
        base_pos = p.getBasePositionAndOrientation(self.robot_id)[0]
        return base_pos[2] < 0.2 or base_pos[2] > 1.5  # fallen

    def _compute_reward(self):
        base_pos, _ = p.getBasePositionAndOrientation(self.robot_id)
        base_vel, _ = p.getBaseVelocity(self.robot_id)

        forward_reward = base_vel[0]  # reward x velocity
        if not self._check_termination():
            alive_bonus = 1
        else:
            alive_bonus = -500
        torque_penalty = 0.001 * np.sum(np.square(self.last_action))

        # print(forward_reward, alive_bonus, torque_penalty)

        return forward_reward + alive_bonus - torque_penalty

    def render(self):
        if self.render_mode:
            if hasattr(self, 'robot_id'):
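Aside (not part of this diff): the new contact-based penalty hard-codes the foot link indices as 1 and 3, and the comment in __init__ asks for them to be adjusted to the URDF. A minimal sketch of resolving them by link name at load time instead, assuming the URDF names the foot links "left_foot" and "right_foot" (hypothetical names, not confirmed by this change):

    import pybullet as p

    def find_link_indices(body_id, wanted=("left_foot", "right_foot")):
        # In PyBullet a link index equals its parent joint index; field 12 of
        # getJointInfo is the link name, returned as bytes.
        indices = {}
        for j in range(p.getNumJoints(body_id)):
            name = p.getJointInfo(body_id, j)[12].decode("utf-8")
            if name in wanted:
                indices[name] = j
        return indices

With such a lookup, __init__ could set self.left_foot and self.right_foot from the returned dict rather than the hard-coded indices.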
17 changes: 12 additions & 5 deletions envs/test.py
@@ -15,15 +15,21 @@
    max_episode_steps=10000,
)

def train():
def train_PPO():
    LOG_DIR = "./logs/"
    env = gym.make('simpleBiped-v0', render=False)
    model = DDPG("MlpPolicy", env, verbose=1, device="auto", tensorboard_log=LOG_DIR)
    model.learn(total_timesteps=10000000)
    model = PPO("MlpPolicy", env, verbose=1, learning_rate=0.001, device="auto", tensorboard_log=LOG_DIR)
    model.learn(total_timesteps=500000)
    model.save("../models/humanoid_ppo")
    env.close()


def train_DDPG():
    LOG_DIR = "./logs/"
    env = gym.make('simpleBiped-v0', render_mode=True)
    model = DDPG("MlpPolicy", env, verbose=1, device="auto", tensorboard_log=LOG_DIR)
    model.learn(total_timesteps=10000000)
    model.save("../models/humanoid_ddpg")
    env.close()

def run():
    env = DummyVecEnv([lambda: simpleBipedEnv(render=True)])
@@ -58,6 +64,7 @@ def test():
print(f"Test episode finished. Total reward: {total_reward}")
env.close()

train()
train_PPO()
#train_DDPG()
#run()
#test()
3 changes: 2 additions & 1 deletion requirements.txt
@@ -2,4 +2,5 @@ gymnasium==0.29.1
pybullet==3.2.5
stable-baselines3==2.2.1
numpy>=1.24.0
matplotlib>=3.7.0
matplotlib>=3.7.0
tensorboard==2.20.0
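Aside (not part of this diff): the new tensorboard pin presumably pairs with the tensorboard_log=LOG_DIR argument used in the training functions; if so, the logged PPO training curves could be viewed locally with tensorboard --logdir ./logs/.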