| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
|
|
| |
|
|
| import tensorflow as tf |
| import numpy as np |
| import gym, time, random, threading |
| from keras.callbacks import TensorBoard |
| from keras.models import * |
| from keras.layers import * |
| from keras import backend as K |
|
|
| from tcl_env_dqn_1 import * |
| print("after import") |
| import os |
|
|
|
|
|
|
| |
# Directory holding the per-day ".h5" weight files (read by
# Brain.predict_p_vote, written by Environment.runEpisode).
path = os.getcwd()
MODELS_DIRECTORY = "/".join((path, "success1"))

# TensorBoard log directory; the integer start time makes it unique per run.
NAME = "A3C++logs/A3C++" + str(int(time.time()))

# --- threading / run length -------------------------------------------------
RUN_TIME = 5000          # seconds the main thread sleeps while workers train
THREADS = 16             # number of Environment worker threads
OPTIMIZERS = 2           # number of Optimizer threads
THREAD_DELAY = 1e-6      # sleep (seconds) before every environment step

# --- n-step return ----------------------------------------------------------
N_STEP_RETURN = 15
GAMMA = 1.0
GAMMA_N = pow(GAMMA, N_STEP_RETURN)

# --- epsilon-greedy exploration (defaults of Environment.__init__) -----------
EPS_START = 0.5
EPS_STOP = 0.001
EPS_DECAY = 0.000005

# --- batching (defaults of Brain.__init__) ----------------------------------
MIN_BATCH = 200
TR_FREQ = 100

# --- loss weights used in Brain._build_graph --------------------------------
LOSS_V = 0.4
LOSS_ENTROPY = 1.0

# Initial "best reward so far" assigned to every day in Brain.rewards.
max_reward = -100.0

# NOTE(review): TRAINING_ITERATIONS is not referenced anywhere in the visible source.
TRAINING_ITERATIONS = 1
LEARNING_RATE = 0.001
| |
| |
| |
class Brain:
    """Actor-critic network, its training op, and the queues feeding it.

    Worker agents push n-step transitions with :meth:`train_push`; optimizer
    threads call :meth:`optimize`, which drains ``train_queue_copy`` and runs
    one minimisation step on it.

    Keyword Args:
        environment: object whose ``.env`` exposes ``observation_space``,
            ``action_space``, ``num_tcls``, ``day0`` and ``dayn`` (required).
        learning_rate: RMSProp learning rate (default ``LEARNING_RATE``).
        training_frequency: minimum queued samples before training
            (default ``TR_FREQ``).
        min_batch: minimum queued samples before training, and the size the
            replay queue is down-sampled to (default ``MIN_BATCH``).
        gamma_n: discount applied to the bootstrapped value (default ``GAMMA_N``).
        models_directory: directory scanned by :meth:`predict_p_vote`
            (default ``MODELS_DIRECTORY``).
    """

    # Column layout of both queues: [s, a, r, s_, s_mask].
    # These are class attributes, i.e. shared by all instances until an
    # instance rebinds them (optimize() does rebind them on ``self``).
    train_queue = [[], [], [], [], []]
    train_queue_copy = [[], [], [], [], []]
    lock_queue = threading.Lock()

    def __init__(self, **kwargs):
        self.env = kwargs.get("environment")
        if self.env is None:
            raise ValueError("Brain requires an 'environment' keyword argument")
        self.learning_rate = kwargs.get('learning_rate', LEARNING_RATE)
        self.tr_freq = kwargs.get('training_frequency', TR_FREQ)
        self.min_batch = kwargs.get('min_batch', MIN_BATCH)
        self.gamman = kwargs.get('gamma_n', GAMMA_N)
        self.models_directory = kwargs.get('models_directory', MODELS_DIRECTORY)
        self.num_state = self.env.env.observation_space.shape[0]
        self.num_tcl = self.env.env.num_tcls
        self.num_actions = self.env.env.action_space.n
        # Stand-in "next state" stored for terminal transitions (see train_push).
        self.none_state = np.zeros(self.num_state)

        # The training graph below is built from placeholders, so it needs
        # graph (non-eager) mode.
        tf.compat.v1.disable_eager_execution()
        K.manual_variable_initialization(True)

        self.model = self._build_model(num_state=self.num_state, num_tcls=self.num_tcl)
        self.graph = self._build_graph(self.model)

        # Variable initialisation was switched to manual above, so it has to
        # be done explicitly. The session is kept because optimize() must run
        # the training op through it.
        self.session = self._backend_session()
        self.session.run(tf.compat.v1.global_variables_initializer())

        # Best episode reward seen so far, per day.
        self.max_reward = max_reward
        self.rewards = {}
        for i in range(self.env.env.day0, self.env.env.dayn):
            self.rewards[i] = self.max_reward

    @staticmethod
    def _backend_session():
        """Return the TensorFlow session the Keras backend is using."""
        try:
            return K.get_session()
        except (AttributeError, RuntimeError):
            # Some Keras/TF2 combinations do not expose (or refuse) get_session
            # on the imported backend; the v1-compat backend still provides it.
            return tf.compat.v1.keras.backend.get_session()

    def _build_model(self, num_state, num_tcls):
        """Build the two-headed Keras model.

        The first ``num_tcls`` input features are averaged into a single
        value, concatenated with the remaining features and fed through one
        hidden layer. Outputs are ``[action probabilities (softmax), state
        value (linear scalar)]``.
        """
        l_input = Input(batch_shape=(None, num_state))
        print('input shape')
        print(format(l_input.shape.as_list()))

        l_input1 = Lambda(lambda x: x[:, 0:num_tcls])(l_input)
        l_input2 = Lambda(lambda x: x[:, num_tcls:])(l_input)
        print(self.env.env.num_tcls)
        # (batch, num_tcls) -> (batch, num_tcls, 1) -> pooled to (batch, 1, 1) -> (batch, 1)
        l_input1 = Reshape((num_tcls, 1))(l_input1)
        l_Pool = AveragePooling1D(pool_size=num_tcls)(l_input1)
        l_Pool = Reshape([1])(l_Pool)
        l_dense = Concatenate()([l_Pool, l_input2])
        l_dense = Dense(100, activation='relu')(l_dense)
        l_dense = Dropout(0.3)(l_dense)
        out = Dense(self.num_actions, activation='softmax')(l_dense)
        out_value = Dense(1, activation='linear')(l_dense)
        model = Model(inputs=l_input, outputs=[out, out_value])
        model._make_predict_function()
        return model

    def _build_graph(self, model):
        """Build the training graph.

        Returns:
            ``(s_t, a_t, r_t, minimize, loss_total)``: the state, one-hot
            action and return placeholders, the RMSProp training op, and the
            scalar loss tensor.
        """
        s_t = tf.compat.v1.placeholder(tf.float32, shape=(None, self.num_state))
        a_t = tf.compat.v1.placeholder(tf.float32, shape=(None, self.num_actions))
        r_t = tf.compat.v1.placeholder(tf.float32, shape=(None, 1))
        p, v = model(s_t)
        # Log-probability of the action actually taken (a_t is one-hot).
        log_prob = tf.math.log(tf.reduce_sum(input_tensor=p * a_t, axis=1, keepdims=True) + 1e-10)
        advantage = r_t - v
        # stop_gradient: the policy term must not push gradients into the value head.
        loss_policy = -log_prob * tf.stop_gradient(advantage)
        loss_value = LOSS_V * tf.square(advantage)
        # sum(p * log p) is the *negative* entropy, so adding it to the loss
        # rewards higher-entropy policies.
        entropy = LOSS_ENTROPY * (tf.reduce_sum(input_tensor=p * tf.math.log(p + 1e-10), axis=1, keepdims=True))
        loss_total = tf.reduce_mean(input_tensor=loss_policy + loss_value + entropy)
        optimizer = tf.compat.v1.train.RMSPropOptimizer(self.learning_rate)
        minimize = optimizer.minimize(loss_total)
        return s_t, a_t, r_t, minimize, loss_total

    def _batch_ready(self):
        """Return True when enough fresh samples are queued to train on."""
        queued = len(self.train_queue_copy[0])
        return queued >= self.tr_freq and queued >= self.min_batch

    def optimize(self):
        """Run one training step on all freshly queued samples, if enough exist.

        Returns ``None`` in every case. When too few samples are queued the
        call only yields the thread and returns.
        """
        if not self._batch_ready():
            time.sleep(0)  # yield to other threads
            return

        with self.lock_queue:
            # Re-check under the lock: another optimizer thread may have
            # drained the queue since the unlocked check above.
            if not self._batch_ready():
                return

            # Down-sample the long-lived queue to min_batch random
            # transitions. zip() is used rather than np.array(): the columns
            # mix arrays and scalars, which NumPy cannot stack into a regular
            # array.
            rows = random.sample(list(zip(*self.train_queue)), self.min_batch)
            if rows:
                self.train_queue = [list(col) for col in zip(*rows)]
            else:
                self.train_queue = [[], [], [], [], []]

            s, a, r, s_, s_mask = self.train_queue_copy
            self.train_queue_copy = [[], [], [], [], []]

        s = np.vstack(s)
        a = np.vstack(a)
        r = np.vstack(r)
        s_ = np.vstack(s_)
        s_mask = np.vstack(s_mask)

        if len(s) > 5 * self.min_batch:
            print("Optimizer alert! Minimizing batch of %d" % len(s))

        # Bootstrap the return with the predicted value of the last state;
        # s_mask is 0 for terminal transitions, so those get no bootstrap.
        v = self.predict_v(s_)
        r = r + self.gamman * v * s_mask

        s_t, a_t, r_t, minimize, loss = self.graph
        print("Training...")
        # ``minimize`` is a graph operation, not a callable: it has to be
        # executed by the session with the placeholders fed.
        self.session.run(minimize, feed_dict={s_t: s, a_t: a, r_t: r})
        print("Done...")

    def train_push(self, s, a, r, s_):
        """Append one transition to both queues.

        ``s_ is None`` marks a terminal transition: ``none_state`` is stored
        as the next state together with a mask of ``0.``; otherwise ``s_`` is
        stored with a mask of ``1.``.
        """
        if s_ is None:
            next_state, mask = self.none_state, 0.
        else:
            next_state, mask = s_, 1.

        with self.lock_queue:
            for queue in (self.train_queue, self.train_queue_copy):
                queue[0].append(s)
                queue[1].append(a)
                queue[2].append(r)
                queue[3].append(next_state)
                queue[4].append(mask)

    def predict(self, s):
        """Return ``(p, v)``: action probabilities and state values for batch ``s``."""
        p, v = self.model.predict(s)
        return p, v

    def predict_p(self, s):
        """Return only the action probabilities for batch ``s``."""
        p, v = self.model.predict(s)
        return p

    def predict_p_vote(self, s):
        """Let every saved ``.h5`` model vote on the action for ``s``.

        Each weight file in ``models_directory`` is loaded into ``self.model``
        (the weights stay loaded afterwards), its most probable action is
        looked up in ``ACTIONS``, and the element-wise average of all votes is
        rounded to integers.

        NOTE(review): ``ACTIONS`` is not defined in this file; presumably it
        comes from the ``tcl_env_dqn_1`` star-import. Confirm.

        Raises:
            RuntimeError: if no model produced a vote.
        """
        votes = []
        for filename in os.listdir(self.models_directory):
            if not filename.endswith(".h5"):
                continue
            try:
                self.model.load_weights(os.path.join(self.models_directory, filename))
                p = self.model.predict(s)[0][0]
                votes.append(ACTIONS[np.argmax(p)])
            except Exception as exc:
                # Best effort: one unreadable/incompatible file must not stop
                # the vote, but say why it was skipped.
                print(filename + "didn't vote!", exc)

        if not votes:
            raise RuntimeError(
                "no model in %r produced a vote" % (self.models_directory,))

        boosted_p = np.average(np.array(votes), axis=0)
        return np.rint(boosted_p).astype(int)

    def predict_v(self, s):
        """Return only the state values for batch ``s``."""
        p, v = self.model.predict(s)
        return v
|
|
| |
| |
| |
# Number of Agent.act() calls made so far, shared by all agents; it drives
# the epsilon decay in Agent.getEpsilon().
frames = 0


class Agent:
    """Epsilon-greedy actor that turns its experience into n-step samples.

    Args:
        eps_start: epsilon before any frame has been played.
        eps_end: lower bound for epsilon.
        eps_decay: amount subtracted from epsilon per global frame.
        num_actions: size of the discrete action space.
    """

    def __init__(self, eps_start, eps_end, eps_decay, num_actions):
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.memory = []   # pending (s, a, r, s_) tuples, oldest first
        self.R = 0.        # running return over the tuples in ``memory``
        self.num_actions = num_actions

    def getEpsilon(self):
        """Return epsilon, decayed linearly in ``frames`` and floored at ``eps_end``."""
        return max(self.eps_start - frames * self.eps_decay, self.eps_end)

    def act(self, s, render=False, br=None):
        """Choose an action for state ``s``.

        Args:
            s: current state (a single, un-batched observation).
            render: when true and the greedy branch is taken, the action is
                produced by ``brain.predict_p_vote``.
            br: optional brain; when given it replaces the module-level ``brain``.

        Returns:
            ``(a, p)``. ``a`` is an action index, or, in the render/vote
            branch, the vote result converted to a list. ``p`` has shape
            ``(1, num_actions)``; in the exploration and vote branches it is a
            random Dirichlet draw, not a network output.
        """
        global frames, brain
        if br is not None:
            brain = br
        eps = self.getEpsilon()
        frames = frames + 1

        if random.random() < eps:
            # Exploration: the argmax of a random probability vector below is
            # a uniformly random action.
            p = np.random.dirichlet(np.ones(self.num_actions), size=1)
        else:
            s = np.array([s])
            if render:
                print('starting the vote')
                a = brain.predict_p_vote(s)
                p = np.random.dirichlet(np.ones(self.num_actions), size=1)
                print(a)
                return list(a), p
            p = brain.predict_p(s)

        a = np.argmax(p.reshape(self.num_actions,))
        return a, p

    def _n_step_sample(self, n):
        """Return ``(s, a, R, s_)``: start of memory, running return, n-th next state."""
        s, a, _, _ = self.memory[0]
        _, _, _, s_ = self.memory[n - 1]
        return s, a, self.R, s_

    def train(self, s, a, r, s_):
        """Record a transition and push every n-step sample that became complete.

        Samples go to the module-level ``brain`` via ``train_push``.
        ``s_ is None`` marks the end of an episode: every remaining partial
        window is flushed and the running return is reset.
        """
        self.memory.append((s, a, r, s_))
        self.R = (self.R + r * GAMMA_N) / GAMMA

        if s_ is None:
            while len(self.memory) > 0:
                n = len(self.memory)
                s, a, r, s_ = self._n_step_sample(n)
                brain.train_push(s, a, r, s_)
                self.R = (self.R - self.memory[0][2]) / GAMMA
                self.memory.pop(0)
            self.R = 0

        if len(self.memory) >= N_STEP_RETURN:
            s, a, r, s_ = self._n_step_sample(N_STEP_RETURN)
            brain.train_push(s, a, r, s_)
            # Slide the window: drop the oldest reward from the running return.
            self.R = self.R - self.memory[0][2]
            self.memory.pop(0)
|
|
|
|
| |
|
|
|
|
| |
| |
| |
class Environment(threading.Thread):
    """Worker thread that plays ``MicroGridEnv`` episodes with its own Agent.

    Args:
        render: when true, episodes are rendered, nothing is pushed for
            training, and ``runEpisode`` returns the episode reward.
        eps_start, eps_end, eps_decay: exploration schedule handed to the Agent.
        **kwargs: forwarded unchanged to ``MicroGridEnv``.
    """

    stop_signal = False

    def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_decay=EPS_DECAY, **kwargs):
        threading.Thread.__init__(self)

        self.render = render
        self.env = MicroGridEnv(**kwargs)
        self.agent = Agent(eps_start, eps_end, eps_decay, num_actions=self.env.action_space.n)
        # Optional brain override passed to Agent.act(); None means "use the
        # module-level ``brain``".
        self.brain = None

    def runEpisode(self, day=None, pplt=True, web=False):
        """Play one episode.

        Args:
            day: forwarded to the environment's reset call.
            pplt: unused; kept for backward compatibility.
            web: when false the environment is reset with ``reset_all`` and
                the episode reward is appended to the global ``REWARDS``;
                when true ``reset`` is used and ``REWARDS`` is not touched.

        Returns:
            The total episode reward when ``self.render`` is true, else ``None``.
        """
        if not web:
            s = self.env.reset_all(day=day)
        else:
            s = self.env.reset(day=day)

        R = 0
        while True:
            time.sleep(THREAD_DELAY)

            a, p = self.agent.act(s, self.render, self.brain)
            s_, r, done, _ = self.env.step(a)
            R += r

            if self.render:
                self.env.render(R)

            if done:
                s_ = None  # terminal marker understood by Agent.train / Brain.train_push

            if not self.render:
                # One-hot encode the action. The size comes from the agent so
                # this does not depend on a global that only exists when the
                # file is run as a script.
                aa = np.zeros(shape=(self.agent.num_actions,))
                aa[a] = 1
                self.agent.train(s, aa, r, s_)
            s = s_

            if done:
                break

        print("episode has been ran")
        print(R)
        if not web:
            REWARDS[self.env.day].append(R)

        if self.render:
            return R

        # Keep the weights of the best low-exploration episode seen for this day.
        if R > brain.rewards[self.env.day] and self.agent.getEpsilon() < 0.2:
            print('new max found: ' + str(R))
            print("-------------------------------------------------------------------------------------------------")
            self._save_best_model()
            brain.rewards[self.env.day] = R

    def _save_best_model(self):
        """Best-effort: write the TensorBoard graph and this day's model file."""
        session = getattr(brain, "session", None)
        if session is not None:
            try:
                writer = tf.compat.v1.summary.FileWriter(NAME, session.graph)
                writer.close()  # flush and release the event file
            except Exception as exc:
                print("Could not write TensorBoard graph: {}".format(exc))

        try:
            brain.model.save(MODELS_DIRECTORY + "/A3C++" + str(self.env.day) + ".h5")
            print("Model saved")
        except Exception as exc:
            # A failed save must not kill the worker thread, but it should be visible.
            print("Model could not be saved: {}".format(exc))

    def run(self):
        """Thread body: play episodes until ``stop()`` is called."""
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        """Ask the thread to finish after the current episode."""
        self.stop_signal = True
|
|
|
|
| |
class Optimizer(threading.Thread):
    """Background thread that repeatedly asks the global ``brain`` to train."""

    # Class-level default; stop() overrides it on the instance.
    stop_signal = False

    def __init__(self):
        super().__init__()

    def run(self):
        """Call ``brain.optimize()`` in a loop until a stop has been requested."""
        while True:
            if self.stop_signal:
                break
            brain.optimize()

    def stop(self):
        """Request the loop in ``run`` to end after the current optimize call."""
        self.stop_signal = True
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: optionally train with worker/optimizer threads, then
    # play one rendered test episode per day and print the average reward.
    import pickle
    import sys

    TRAIN = False

    # Range of days [DAY0, DAYN) handed to the environments.
    DAY0 = 0
    DAYN = 10

    # Episode rewards per day; Environment.runEpisode appends to these lists.
    REWARDS = {}
    for i in range(DAY0, DAYN):
        REWARDS[i] = []

    # Evaluation environment: rendering on, exploration off (epsilon fixed at 0).
    env_test = Environment(render=True, eps_start=0., eps_end=0., day0=DAY0, dayn=DAYN, iterations=24)
    NUM_STATE = env_test.env.observation_space.shape[0]
    NUM_ACTIONS = env_test.env.action_space.n
    NONE_STATE = np.zeros(NUM_STATE)

    # Module-level brain used by Agent, Environment and Optimizer.
    brain = Brain(environment=env_test)

    if TRAIN:
        envs = [Environment(day0=DAY0, dayn=DAYN) for i in range(THREADS)]
        opts = [Optimizer() for i in range(OPTIMIZERS)]
        t0 = time.time()

        for o in opts:
            o.start()

        for e in envs:
            e.start()

        # Let the threads train for RUN_TIME seconds.
        time.sleep(RUN_TIME)

        # Stop the workers first so no new samples arrive, then the optimizers.
        for e in envs:
            e.stop()
        for e in envs:
            e.join()

        for o in opts:
            o.stop()
        for o in opts:
            o.join()

        # model.save does not create the target directory.
        os.makedirs("success00", exist_ok=True)
        brain.model.save("success00/A3C++" + ".h5")
        print("Training finished")
        print('training_time:', time.time() - t0)

        # NOTE(review): indentation was lost in this file; the dump is placed
        # inside the TRAIN branch because the file name refers to training
        # rewards. Confirm.
        with open("REWARDS_A3C++train.pkl", 'wb') as f:
            pickle.dump(REWARDS, f, pickle.HIGHEST_PROTOCOL)

    try:
        for day in range(DAY0, DAYN):
            env_test.runEpisode(day)
        print("average reward: ", np.average([REWARDS[i][-1] for i in range(DAY0, DAYN)]))
    except NameError as err:
        # Report the actual missing name, not the exception class.
        print(err)
|
|
|
|
|
|
|
|