SmartEngine  1.6.0
PPOTrainer.h
1 // Copyright (C) Entropy Software LLC - All Rights Reserved
2 
3 #pragma once
4 
5 #include "CuriosityModule.h"
6 #include "Object.h"
7 #include "RLTrainer.h"
8 
9 namespace SmartEngine
10 {
11 
12 #pragma pack(push, 4)
/// Data used to construct an IPPOTrainer instance.
/// (Packed to 4 bytes so the layout matches the C ABI used by PPOTrainer_CreateInstance.)
13 struct PPOTrainerCInfo : RLTrainerCInfo
17 {
    /// The graph we are training. This should contain the policy network and value network.
22  IGraph* graph = nullptr;
23 
    /// Optional curiosity module for additional exploration rewards.
27  ICuriosityModule* curiosityModule = nullptr; // Optional
28 
    /// The name of the output of the critic node. This node should be a linear layer
    /// with one output neuron.
33  const char* valueNodeName = "";
34 
    /// How much the value contributes to the loss.
38  float valueCoefficient = 1.0f;
39 
    /// How much entropy contributes to the loss. Entropy is a measure of how random
    /// our output is.
45  float entropyCoefficient = 0.01f;
46 
    /// Range (percent) within which we allow the policy to change in one step.
50  float policyClipEpsilon = 0.2f;
51 
    /// Multiplier on top of gamma (the GAE lambda smoothing factor).
55  float gaeLambda = 0.95f;
56 
    /// How many rows of data we should wait for before training.
60  int trajectorySize = 2048;
61 
    /// How many rows of data we should train in a single batch.
65  int batchSize = 32;
66 
    /// How many times we should train over the trajectory.
70  int epochCount = 10;
71 
    /// If true, the advantage (actual reward - expected reward) is normalized by
    /// subtracting the mean and dividing by the standard deviation.
77  bool normalizeAdvantage = true;
78 };
79 #pragma pack(pop)
80 
/// The PPO Trainer is a reinforcement learning trainer that is composed of two parts:
/// an actor sub graph (the policy) and a critic sub graph (the value estimate).
111 class SMARTENGINE_EXPORT IPPOTrainer : public IRLTrainer
112 {
113 public:
114  SMARTENGINE_DECLARE_CLASS(IPPOTrainer)
115 
116 
    /// Returns the loss of the policy (actor) sub-graph.
117  virtual float GetPolicyLoss() = 0;
120 
    /// Returns the loss in the value (critic) sub-graph.
124  virtual float GetValueLoss() = 0;
125 
    /// Returns the entropy loss - a measure of how random the network is.
129  virtual float GetEntropyLoss() = 0;
130 };
131 
/// Creates an instance of IPPOTrainer from the given construction info.
135 SMARTENGINE_EXPORT ObjectPtr<IPPOTrainer> CreatePPOTrainer(const PPOTrainerCInfo& cinfo);
136 
/// C ABI bindings (for language interop / scripting layers). Each function mirrors a
/// member of IPPOTrainer, taking the object handle returned by PPOTrainer_CreateInstance.
138 extern "C"
139 {
140  SMARTENGINE_EXPORT ObjPtr PPOTrainer_CreateInstance(const PPOTrainerCInfo& cinfo);
141  SMARTENGINE_EXPORT float PPOTrainer_GetPolicyLoss(ObjPtr object);
142  SMARTENGINE_EXPORT float PPOTrainer_GetValueLoss(ObjPtr object);
143  SMARTENGINE_EXPORT float PPOTrainer_GetEntropyLoss(ObjPtr object);
144 }
146 
147 } // namespace SmartEngine
SmartEngine::PPOTrainerCInfo::valueNodeName
const char * valueNodeName
The name of the output of the critic node. This node should be a linear layer with one output neuron ...
Definition: PPOTrainer.h:33
SmartEngine::PPOTrainerCInfo::batchSize
int batchSize
How many rows of data we should train in a single batch.
Definition: PPOTrainer.h:65
SmartEngine::PPOTrainerCInfo::valueCoefficient
float valueCoefficient
How much the value contributes to the loss
Definition: PPOTrainer.h:38
SmartEngine::PPOTrainerCInfo::policyClipEpsilon
float policyClipEpsilon
Range (percent) within which we allow the policy to change in one step.
Definition: PPOTrainer.h:50
SmartEngine::PPOTrainerCInfo::epochCount
int epochCount
How many times we should train over the trajectory.
Definition: PPOTrainer.h:70
SmartEngine::PPOTrainerCInfo::gaeLambda
float gaeLambda
Multiplier applied on top of gamma when computing the generalized advantage estimate (GAE).
Definition: PPOTrainer.h:55
SmartEngine::PPOTrainerCInfo::curiosityModule
ICuriosityModule * curiosityModule
Optional curiosity module for additional exploration rewards
Definition: PPOTrainer.h:27
SmartEngine::IRLTrainer
Base class for all reinforcement learning trainers.
Definition: RLTrainer.h:69
SmartEngine::PPOTrainerCInfo::entropyCoefficient
float entropyCoefficient
How much entropy contributes to the loss. Entropy is a measure of how random our output is....
Definition: PPOTrainer.h:45
SmartEngine::PPOTrainerCInfo::trajectorySize
int trajectorySize
How many rows of data we should wait for before training
Definition: PPOTrainer.h:60
SmartEngine::ObjectPtr
Smart pointer to an IObject. Automatic ref counting.
Definition: ObjectPtr.h:16
SmartEngine::IPPOTrainer
The PPO Trainer is a reinforcement learning trainer that is composed of two parts: an actor sub graph...
Definition: PPOTrainer.h:112
SmartEngine
Definition: A2CTrainer.h:10
SmartEngine::PPOTrainerCInfo
Data used to construct an IPPOTrainer instance
Definition: PPOTrainer.h:17
SmartEngine::IPPOTrainer::GetEntropyLoss
virtual float GetEntropyLoss()=0
Returns the entropy loss - a measure of how random the network is.
SmartEngine::ICuriosityModule
A curiosity module is a way of rewarding an agent for behavior not yet seen. Rewards are given based ...
Definition: CuriosityModule.h:108
SmartEngine::IGraph
A graph is a collection of buffers and nodes that together form a neural network. The graph is create...
Definition: Graph.h:61
SmartEngine::CreatePPOTrainer
SMARTENGINE_EXPORT ObjectPtr< IPPOTrainer > CreatePPOTrainer(const PPOTrainerCInfo &cinfo)
Creates an instance of IPPOTrainer
SmartEngine::IPPOTrainer::GetValueLoss
virtual float GetValueLoss()=0
Returns the loss in value sub-graph
SmartEngine::PPOTrainerCInfo::normalizeAdvantage
bool normalizeAdvantage
If true, the advantage (actual reward - expected reward) is normalized by subtracting the mean and dividing by the standard deviation.
Definition: PPOTrainer.h:77
SmartEngine::PPOTrainerCInfo::graph
IGraph * graph
The graph we are training. This should contain the policy network and value network.
Definition: PPOTrainer.h:22