У меня несоответствие форм входных данных и модели моего проекта обучения с подкреплением.
Я внимательно слежу за примерами AWS, особенно за примером тележки. Однако я создал свою собственную среду. Я изо всех сил пытаюсь понять, как изменить мою среду, чтобы она могла работать с предварительно созданным Ray RLEstimator.
Вот код среды:
from enum import Enum
import math
import gym
from gym import error, spaces, utils, wrappers
from gym.utils import seeding
from gym.envs.registration import register
from gym.spaces import Discrete, Box
import numpy as np
# from float_space import FloatSpace
def sigmoid_price_fun(x, maxcust, gamma):
return maxcust / (1 + math.exp(gamma * max(0, x)))
class Actions(Enum):
DECREASE_PRICE = 0
INCREASE_PRICE = 1
HOLD = 2
PRICE_ADJUSTMENT = {
Actions.DECREASE_PRICE: -0.25,
Actions.INCREASE_PRICE: 0.25,
Actions.HOLD: 0
}
class ArrivalSim(gym.Env):
""" Simple environment for price optimising RL learner. """
def __init__(self, price):
"""
Parameters
----------
price : float
The initial price to use.
"""
super().__init__()
self.price = price
self.revenue = 0
self.action_space = Discrete(3) # [0, 1, 2] #increase or decrease
self.observation_space = Box(np.array(0.0),np.array(1000))
# self.observation_space = FloatSpace(price)
def step(self, action):
""" Enacts the specified action in the environment.
Returns the new price, reward, whether we're finished and an empty dict for compatibility with Gym's
interface. """
self._take_action(Actions(action))
next_state = self.price
# next_state = self.observation_space.sample()
reward = self._get_reward()
done = False
if next_state < 0 or reward == 0:
done = True
print(next_state, reward, done, {})
return np.array(next_state), reward, done, {}
def reset(self):
""" Resets the environment, selecting a random initial price. Returns the price. """
# self.observation_space.value = np.random.rand()
# return self.observation_space.sample()
self.price = np.random.rand()
return self.price
def _take_action(self, action):
# self.observation_space.value += PRICE_ADJUSTMENT[action]
self.price += PRICE_ADJUSTMENT[action]
def _get_reward(self,price):
# price = self.observation_space.value
# return max(np.random.poisson(sigmoid_price_fun(price, 50, 0.5)) * price, 0)
self.revenue = max(np.random.poisson(sigmoid_price_fun(self.price, 50, 0.5)) * self.price, 0)
return max(np.random.poisson(sigmoid_price_fun(self.price, 50, 0.5)) * self.price, 0)
# def render(self, mode='human'):
# super().render(mode)
def testEnv():
register(
id='ArrivalSim-v0',
entry_point='env:ArrivalSim',
kwargs= {'price' : 40}
)
env = gym.make('ArrivalSim-v0')
env.reset()
for _ in range(20):
test = env.action_space.sample()
print(test)
print(env.observation_space)
env.step(test) # take a random action
env.close()
if __name__ =='__main__':
testEnv()
Вот сценарий обучения
import json
import os
import gym
import ray
from ray.tune import run_experiments
from ray.tune.registry import register_env
from gym.envs.registration import register
from sagemaker_rl.ray_launcher import SageMakerRayLauncher
def create_environment(env_config):
import gym
# from gym.spaces import Space
from gym.envs.registration import register
# This import must happen inside the method so that worker processes import this code
register(
id='ArrivalSim-v0',
entry_point='env:ArrivalSim',
kwargs= {'price' : 40}
)
return gym.make('ArrivalSim-v0')
class MyLauncher(SageMakerRayLauncher):
def register_env_creator(self):
register_env("ArrivalSim-v0", create_environment)
def get_experiment_config(self):
return {
"training": {
"env": "ArrivalSim-v0",
"run": "PPO",
"stop": {
"episode_reward_mean": 5000,
},
"config": {
"gamma": 0.995,
"kl_coeff": 1.0,
"num_sgd_iter": 10,
"lr": 0.0001,
"sgd_minibatch_size": 32768,
"train_batch_size": 320000,
"monitor": False, # Record videos.
"model": {
"free_log_std": False
},
"use_gae": False,
"num_workers": (self.num_cpus-1),
"num_gpus": self.num_gpus,
"batch_mode": "complete_episodes"
}
}
}
if __name__ == "__main__":
MyLauncher().train_main()
Вот код, который я запускаю в Jupyter:
metric_definitions = RLEstimator.default_metric_definitions(RLToolkit.RAY)
environment = env = {
'SAGEMAKER_REQUIREMENTS': 'requirements.txt', # path relative to `source_dir` below.
}
estimator = RLEstimator(entry_point="train.py",
source_dir='.',
toolkit=RLToolkit.RAY,
toolkit_version='0.6.5',
framework=RLFramework.TENSORFLOW,
dependencies=["sagemaker_rl"],
# image_name='price-response-ray-cpu',
role=role,
# train_instance_type="ml.c5.2xlarge",
train_instance_type='local',
train_instance_count=1,
# output_path=s3_output_path,
# base_job_name=job_name_prefix,
metric_definitions=metric_definitions
# hyperparameters={
# Attention scientists! You can override any Ray algorithm parameter here:
#"rl.training.config.horizon": 5000,
#"rl.training.config.num_sgd_iter": 10,
#}
)
estimator.fit(wait=True)
job_name = estimator.latest_training_job.job_name
print("Training job: %s" % job_name)
Я получаю следующее сообщение об ошибке:
algo-1-dxwxx_1 | == Status ==
algo-1-dxwxx_1 | Using FIFO scheduling algorithm.
algo-1-dxwxx_1 | Resources requested: 0/3 CPUs, 0/0 GPUs
algo-1-dxwxx_1 | Memory usage on this node: 1.1/4.1 GB
algo-1-dxwxx_1 |
algo-1-dxwxx_1 | == Status ==
algo-1-dxwxx_1 | Using FIFO scheduling algorithm.
algo-1-dxwxx_1 | Resources requested: 2/3 CPUs, 0/0 GPUs
algo-1-dxwxx_1 | Memory usage on this node: 1.4/4.1 GB
algo-1-dxwxx_1 | Result logdir: /opt/ml/output/intermediate/training
algo-1-dxwxx_1 | Number of trials: 1 ({'RUNNING': 1})
algo-1-dxwxx_1 | RUNNING trials:
algo-1-dxwxx_1 | - PPO_ArrivalSim-v0_0: RUNNING
algo-1-dxwxx_1 |
algo-1-dxwxx_1 | (pid=72) 2019-08-30 09:35:13,030 WARNING ppo.py:172 -- FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False).
algo-1-dxwxx_1 | 2019-08-30 09:35:13,063 ERROR trial_runner.py:460 -- Error processing event.
algo-1-dxwxx_1 | Traceback (most recent call last):
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/tune/trial_runner.py", line 409, in _process_trial
algo-1-dxwxx_1 | result = self.trial_executor.fetch_result(trial)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/tune/ray_trial_executor.py", line 314, in fetch_result
algo-1-dxwxx_1 | result = ray.get(trial_future[0])
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/worker.py", line 2316, in get
algo-1-dxwxx_1 | raise value
algo-1-dxwxx_1 | ray.exceptions.RayTaskError: ray_worker (pid=72, host=b9b15d495b68)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/model.py", line 83, in __init__
algo-1-dxwxx_1 | restored, num_outputs, options)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/model.py", line 135, in _build_layers_v2
algo-1-dxwxx_1 | raise NotImplementedError
algo-1-dxwxx_1 | NotImplementedError
algo-1-dxwxx_1 |
algo-1-dxwxx_1 | During handling of the above exception, another exception occurred:
algo-1-dxwxx_1 |
algo-1-dxwxx_1 | ray_worker (pid=72, host=b9b15d495b68)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/agent.py", line 276, in __init__
algo-1-dxwxx_1 | Trainable.__init__(self, config, logger_creator)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/tune/trainable.py", line 88, in __init__
algo-1-dxwxx_1 | self._setup(copy.deepcopy(self.config))
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/agent.py", line 373, in _setup
algo-1-dxwxx_1 | self._init()
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/ppo/ppo.py", line 77, in _init
algo-1-dxwxx_1 | self.env_creator, self._policy_graph)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/agent.py", line 506, in make_local_evaluator
algo-1-dxwxx_1 | extra_config or {}))
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/agent.py", line 714, in _make_evaluator
algo-1-dxwxx_1 | async_remote_worker_envs=config["async_remote_worker_envs"])
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/evaluation/policy_evaluator.py", line 288, in __init__
algo-1-dxwxx_1 | self._build_policy_map(policy_dict, policy_config)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/evaluation/policy_evaluator.py", line 661, in _build_policy_map
algo-1-dxwxx_1 | policy_map[name] = cls(obs_space, act_space, merged_conf)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/agents/ppo/ppo_policy_graph.py", line 176, in __init__
algo-1-dxwxx_1 | seq_lens=existing_seq_lens)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/catalog.py", line 215, in get_model
algo-1-dxwxx_1 | seq_lens)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/catalog.py", line 255, in _get_model
algo-1-dxwxx_1 | num_outputs, options)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/model.py", line 86, in __init__
algo-1-dxwxx_1 | input_dict["obs"], num_outputs, options)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/ray/rllib/models/fcnet.py", line 37, in _build_layers
algo-1-dxwxx_1 | scope=label)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/framework/python/ops/arg_scope.py", line 182, in func_with_args
algo-1-dxwxx_1 | return func(*args, **current_args)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/layers/python/layers/layers.py", line 1854, in fully_connected
algo-1-dxwxx_1 | outputs = layer.apply(inputs)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 817, in apply
algo-1-dxwxx_1 | return self.__call__(inputs, *args, **kwargs)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/layers/base.py", line 374, in __call__
algo-1-dxwxx_1 | outputs = super(Layer, self).__call__(inputs, *args, **kwargs)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 730, in __call__
algo-1-dxwxx_1 | self._assert_input_compatibility(inputs)
algo-1-dxwxx_1 | File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/base_layer.py", line 1493, in _assert_input_compatibility
algo-1-dxwxx_1 | str(x.shape.as_list()))
algo-1-dxwxx_1 | ValueError: Input 0 of layer default/fc1 is incompatible with the layer: : expected min_ndim=2, found ndim=1. Full shape received: [None]
algo-1-dxwxx_1 |
algo-1-dxwxx_1 | 2019-08-30 09:35:13,064 INFO ray_trial_executor.py:178 -- Destroying actor for trial PPO_ArrivalSim-v0_0. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
algo-1-dxwxx_1 | 2019-08-30 09:35:13,076 INFO trial_runner.py:497 -- Attempting to recover trial state from last checkpoint.
algo-1-dxwxx_1 | (pid=72) 2019-08-30 09:35:13,041 INFO policy_evaluator.py:278 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
Я не уверен, как изменить ввод, который среда дает модели, или саму настройку моделей. Похоже, что документация не совсем ясна. У меня есть подозрение, что проблема связана с областями наблюдения и действия.
Вот ссылка на исходный пример проекта aws: https://github.com/awslabs/amazon-sagemaker-examples/tree/master/reinforcement_learning/rl_roboschool_ray