def train(records):
    """Run one policy-gradient update from the steps of a finished episode.

    Args:
        records: Sequence of ``(state, action, reward)`` tuples, one per
            environment step, in the order the steps were taken.
    """
    if not records:
        # Nothing was collected, so there is nothing to fit on.
        return

    s_batch = np.array([record[0] for record in records])
    # One-hot encode each chosen action so it can mask the predicted
    # action probabilities below.
    a_batch = np.array(
        [[1 if record[1] == i else 0 for i in range(ACTION_DIM)]
         for record in records]
    )
    # Keep only the predicted value for the action actually taken,
    # e.g. [0.3, 0.7] * [0, 1] -> [0, 0.7].
    # assumes model.predict returns one row of ACTION_DIM values per state
    # -- TODO confirm
    prob_batch = model.predict(s_batch) * a_batch
    # presumably one discounted return per step -- verify discount_rewards
    r_batch = discount_rewards([record[2] for record in records])

    # NOTE(review): the snippet as found computed prob_batch and r_batch and
    # then discarded them, so the model was never updated. Fitting on the
    # masked probabilities with the returns as per-sample weights is the
    # usual way to express this update through Keras; confirm that the
    # model's compiled loss is the one intended for it.
    model.fit(s_batch, prob_batch, sample_weight=r_batch, verbose=0)
# Upper bound on the number of episodes to play.
episodes = 2000
# Total reward of every finished episode, in order.
score_list = []

for i in range(episodes):
    s = env.reset()
    score = 0
    # (state, action, reward) for every step of the current episode.
    replay_records = []

    while True:
        a = choose_action(s)
        next_s, r, done, _ = env.step(a)
        replay_records.append((s, a, r))

        score += r
        s = next_s

        if done:
            # Learn from the whole episode once it has ended.
            train(replay_records)
            score_list.append(score)
            print('episode:', i, 'score:', score, 'max:', max(score_list))
            break

    # Stop early and save the model once the mean score of the most recent
    # (up to 10) episodes exceeds 195.
    if np.mean(score_list[-10:]) > 195:
        model.save('CartPole-v0-pg.h5')
        break

env.close()
# NOTE(review): this is a second definition of `train`; an identical one
# appears earlier in this file. It looks like a paste artifact -- confirm
# and keep only one.
def train(records):
    """Run one policy-gradient update from the steps of a finished episode.

    Args:
        records: Sequence of ``(state, action, reward)`` tuples, one per
            environment step, in the order the steps were taken.
    """
    if not records:
        # Nothing was collected, so there is nothing to fit on.
        return

    s_batch = np.array([record[0] for record in records])
    # One-hot encode each chosen action so it can mask the predicted
    # action probabilities below.
    a_batch = np.array(
        [[1 if record[1] == i else 0 for i in range(ACTION_DIM)]
         for record in records]
    )
    # Keep only the predicted value for the action actually taken,
    # e.g. [0.3, 0.7] * [0, 1] -> [0, 0.7].
    # assumes model.predict returns one row of ACTION_DIM values per state
    # -- TODO confirm
    prob_batch = model.predict(s_batch) * a_batch
    # presumably one discounted return per step -- verify discount_rewards
    r_batch = discount_rewards([record[2] for record in records])

    # NOTE(review): the snippet as found computed prob_batch and r_batch and
    # then discarded them, so the model was never updated. Fitting on the
    # masked probabilities with the returns as per-sample weights is the
    # usual way to express this update through Keras; confirm that the
    # model's compiled loss is the one intended for it.
    model.fit(s_batch, prob_batch, sample_weight=r_batch, verbose=0)
# NOTE(review): this training loop repeats an earlier loop in this file
# statement for statement. It looks like a paste artifact -- confirm whether
# a second run is really wanted before keeping both.

# Upper bound on the number of episodes to play.
episodes = 2000
# Total reward of every finished episode, in order.
score_list = []

for i in range(episodes):
    s = env.reset()
    score = 0
    # (state, action, reward) for every step of the current episode.
    replay_records = []

    while True:
        a = choose_action(s)
        next_s, r, done, _ = env.step(a)
        replay_records.append((s, a, r))

        score += r
        s = next_s

        if done:
            # Learn from the whole episode once it has ended.
            train(replay_records)
            score_list.append(score)
            print('episode:', i, 'score:', score, 'max:', max(score_list))
            break

    # Stop early and save the model once the mean score of the most recent
    # (up to 10) episodes exceeds 195.
    if np.mean(score_list[-10:]) > 195:
        model.save('CartPole-v0-pg.h5')
        break

env.close()