Matrices de poids «nan» tflearn
-
31-10-2019 - |
Question
Je voulais construire un DQN. Alors j'ai suivi ce code et j'ai regardé quelques vidéos sur l'idée de DQN. Mon code est le suivant (le mien est écrit en tflearn, le sien en keras) :
import tflearn as tfl
import numpy as np
import gym
from collections import deque
import random
class DeepQ():
    """Deep Q-Network (DQN) agent for Atari Gym environments, built with tflearn.

    Workflow: Q_Network() builds (or loads) the Q-value approximator,
    Q_Learning() collects experience with an epsilon-greedy policy and fits
    one replay minibatch, Q_predict() plays greedily with a saved model.
    """

    def __init__(self, game="SpaceInvaders-v0"):
        self.game = game
        self.env = gym.make(game)
        # Replay memory of (state, action, reward, next_state, done) tuples.
        self.storage = deque()
        self.filter_size = [4, 4]
        self.itertime = 1000                 # number of environment steps to observe
        self.random_move_prop = 0.8          # epsilon for epsilon-greedy exploration
        np.random.seed(1)
        self.minibatch_size = 250
        self.discounted_future_reward = 0.9  # gamma

    def _build_network(self, learning_rate):
        """Build the convolutional Q-network and return its regression estimator.

        One linear output unit per action: the predicted Q-value of that action.
        NOTE(review): raw pixel input (0-255) is fed unnormalized; scaling to
        [0, 1] would likely help the reported NaN-weight problem — to confirm.
        """
        net = tfl.layers.core.input_data(shape=[None, 210, 160, 3])
        net = tfl.layers.conv.conv_2d(net, nb_filter=3,
                                      filter_size=self.filter_size,
                                      activation="relu")
        net = tfl.layers.conv.conv_2d(net, nb_filter=3,
                                      filter_size=self.filter_size,
                                      activation="relu")
        net = tfl.layers.flatten(net)
        net = tfl.layers.fully_connected(net, 10, activation="relu")
        net = tfl.layers.fully_connected(net, self.env.action_space.n,
                                         activation="linear")
        return tfl.layers.estimator.regression(net, learning_rate=learning_rate)

    def Q_Network(self, learning_rate=0.0000001, load=False, model_path=None,
                  checkpoint_path="X://xxx//xxx//Documents//GitHub//Deeplearning_for_starters//Atari_modells//checkpoint.ckpt"):
        """Create self.modell: a fresh DNN, or one loaded from model_path.

        The original duplicated the full network definition in both branches;
        both now share _build_network().
        """
        net = self._build_network(learning_rate)
        if load == False:
            self.modell = tfl.DNN(net, checkpoint_path=checkpoint_path)
        else:
            self.modell = tfl.DNN(net)
            self.modell.load(model_path, weights_only=True)

    def Q_Learning(self):
        """Observe the environment epsilon-greedily, then fit one replay batch."""
        observation = self.env.reset()
        for i in range(self.itertime):
            # self.env.render()
            observation = observation.reshape(1, 210, 160, 3)
            if np.random.rand() <= self.random_move_prop:
                # Exploration: random action.
                action = np.random.randint(low=0, high=self.env.action_space.n)
            else:
                # Exploitation: greedy action from the current Q-network.
                action = np.argmax(self.modell.predict(observation))
            new_observation, reward, done, info = self.env.step(action)
            self.storage.append((observation, action, reward, new_observation, done))
            observation = new_observation
            if done:
                # FIX: the original called env.reset() without keeping its
                # return value, so stepping continued from a terminal state.
                observation = self.env.reset()
        print("###############################################")
        print("Done with observing!")
        print("###############################################")

        # Sample a random minibatch from the replay memory.
        minibatch = random.sample(self.storage, self.minibatch_size)
        state_shape = self.env.observation_space.shape  # (210, 160, 3)
        # FIX: the original used observation.shape, which was already
        # (1, 210, 160, 3), yielding a 5-D batch the 4-D input layer rejects.
        x = np.zeros((self.minibatch_size,) + state_shape)
        y = np.zeros((self.minibatch_size, self.env.action_space.n))
        for i in range(self.minibatch_size):
            Observation, Action, Reward, New_observation, done = minibatch[i]
            print("Processing batch data... (step:" + str(i) + " from "
                  + str(self.minibatch_size) + ")")
            x[i] = Observation.reshape(state_shape)
            # Start from the network's own predictions so only the taken
            # action's target is changed.
            y[i] = self.modell.predict(Observation)
            # FIX: bootstrap from the NEXT state. The original predicted on
            # Observation again, making the target equal the prediction.
            Q_sa = self.modell.predict(New_observation.reshape((1,) + state_shape))
            # FIX: the original wrote y[i, action] = reward using the stale
            # lowercase variables left over from the observation loop, i.e.
            # the wrong action slot and the wrong reward for every sample.
            if done:
                y[i, Action] = Reward
            else:
                y[i, Action] = Reward + self.discounted_future_reward * np.max(Q_sa)
        self.modell.fit_batch(x, y)
        self.modell.save("X://xxx//xxx//xxx//SpaceInvaders1.tfl")
        print("")
        print("Modell fitting acomplished!")
        print("")

    def Q_predict(self, model_path="Your path here"):
        """Load a saved model and play one greedy episode, rendering each frame."""
        self.Q_Network(load=True, model_path=model_path)
        observation = self.env.reset()
        observation = observation.reshape((1,) + observation.shape)
        done = False
        total_reward = 0.0
        while not done:
            self.env.render()
            Q = self.modell.predict(observation)
            print(Q)
            action = np.argmax(Q)
            print(action)
            new_observation, reward, done, info = self.env.step(action)
            # Keep the batch dimension for the next predict() call.
            observation = new_observation.reshape((1,) + new_observation.shape)
            total_reward += reward
        print("Game ends with a score of: " + str(total_reward))
        print("")
Le problème est que si j'exécute la fonction de prédiction, le réseau ne fait rien. J'ai découvert que tous les poids sont remplis de « nan ».
D'après ce que j'ai lu, cela peut dépendre du taux d'apprentissage ; j'ai donc abaissé le taux de 1e-3
à la valeur actuelle, mais cela n'a rien changé.
Pas de solution correcte
Licencié sous: CC-BY-SA avec attribution
Non affilié à datascience.stackexchange