import numpy as np


class environment:
    """
    Environment super-class.
    Represents the environment for the agent.
    The environment must be representable as states,
    there should be a set of possible actions to take on the environment,
    and it should return rewards when we transition between timesteps.
    """
    def __init__(self, data):
        """
        Initializes the environment with some data that can represent it.
        """
        self.data = data

    def get_data(self):
        """
        Returns the data that primarily represents this environment.
        """
        return self.data

    def get_obs_space(self):
        raise NotImplementedError

    def get_action_space(self):
        raise NotImplementedError

    def take_action(self, action):
        raise NotImplementedError

    def get_first_state(self):
        raise NotImplementedError

    def reset(self):
        raise NotImplementedError
class time_environment(environment):  # might need to think of a better name
    """
    The data-set should be a 2D array:
        row i represents the timestep
        column j represents the action
        the number at [i, j] is the value of choosing action j at timestep i
    e.g.:
        data = [[ _, _, _, _],
                [ _, _, _, _],
                [ _, _, _, _]]
    would represent a 3-timestep progression with 4 possible actions.
    """
    def __init__(self, data, final_reward_subtraction=3):
        super().__init__(data)  # data needs to be a 2D matrix
        self.final_subtract = final_reward_subtraction  # how much to subtract for missing an object
        self.action_space = len(data[0])  # action space = number of objects
        self.obs_space = len(data)  # observation space = number of timesteps
        self.timestep = 0  # the timestep, aka our state
        self.collection = np.zeros(shape=[self.action_space], dtype=np.int32)
    def take_action(self, action):
        """
        Updates our state (not the actual network input, just what actions we have acquired up to this point).
        Returns a reward based on the number in the 'data' matrix,
        plus a final reward based on the diversity of objects chosen.
        Returns our state (the timestep), the reward, and whether or not the episode is over.
        """
        # get the reward for the value in the 'data' matrix
        # we're going to call this VALUE REWARD
        value_reward = super().get_data()[self.timestep][action]
        diversity_reward = 0
        # update our record of which objects we do and don't have so far in this episode
        if self.collection[action] == 0:
            self.collection[action] += 1
        # check if we are on the last timestep
        if self.timestep + 1 == self.obs_space:
            done = True
            self.timestep = 0
            # calculate our final reward based on diversity only when the episode ends
            # we're going to call this DIVERSITY REWARD
            for val in self.collection:
                if val == 0:
                    diversity_reward -= self.final_subtract  # deduct a set amount for every object never chosen
            self.collection = np.zeros(shape=[self.action_space], dtype=np.int32)
        else:
            done = False
            self.timestep += 1
        reward = value_reward + diversity_reward
        return self.timestep, reward, done
    def get_obs_space(self):
        """
        Returns the number of states we can see, aka the number of timesteps.
        """
        return self.obs_space

    def get_action_space(self):
        """
        Returns the number of actions we can take, aka the number of objects we observe.
        """
        return self.action_space

    def get_first_state(self):
        """
        Returns the first timestep (0).
        """
        return 0

    def reset(self):
        """
        Resets the timestep to 0.
        """
        self.timestep = 0
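

# A minimal usage sketch, not part of the original module: it assumes a small
# hand-made 3-timestep x 4-action reward matrix (demo_data) and a random
# stand-in policy, purely to illustrate how take_action() combines the
# per-timestep value reward with the end-of-episode diversity penalty.
if __name__ == "__main__":
    demo_data = [[1, 0, 2, 0],   # entry [i][j] = value of taking action j at timestep i
                 [0, 3, 1, 1],
                 [2, 1, 0, 4]]
    env = time_environment(demo_data, final_reward_subtraction=3)
    state = env.get_first_state()
    rng = np.random.default_rng(0)
    done = False
    while not done:
        action = int(rng.integers(env.get_action_space()))  # random action in place of a learned policy
        state, reward, done = env.take_action(action)
        print(f"action={action}, reward={reward}, done={done}")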