视觉问答模型

视觉问答模型

'''
from keras.layers import Conv2D, MaxPooling2D, Flatten, Concatenate
from keras.layers import Input, LSTM, Embedding, Dense, merge
from keras.models import Model, Sequential

# Image encoder: a small stack of 3x3 convolutions with 2x2 max-pooling
# that turns a 224x224 RGB image into one flat feature vector.
vision_model = Sequential()
vision_model.add(Conv2D(64, (3, 3), activation='relu', padding='same', input_shape=(224, 224, 3)))
vision_model.add(Conv2D(64, (3, 3), activation='relu'))
vision_model.add(MaxPooling2D((2, 2)))
vision_model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
vision_model.add(Conv2D(128, (3, 3), activation='relu'))
vision_model.add(MaxPooling2D((2, 2)))
vision_model.add(Conv2D(256, (3, 3), activation='relu', padding='same'))
vision_model.add(Conv2D(256, (3, 3), activation='relu'))
vision_model.add(Conv2D(256, (3, 3), activation='relu'))
vision_model.add(MaxPooling2D((2, 2)))
# Collapse the final feature maps into a single vector per image.
vision_model.add(Flatten())

# Now let's get a tensor with the output of our vision model:
image_input = Input(shape=(224, 224, 3))
encoded_image = vision_model(image_input)

# Question encoder: a sequence of 100 integer token indices is embedded
# into 256-dimensional vectors (indices drawn from a 10000-entry table) and
# summarised by an LSTM into a single 256-dimensional vector.
question_input = Input(shape=(100,), dtype='int32')
embedded_question = Embedding(input_dim=10000, output_dim=256, input_length=100)(question_input)
encoded_question = LSTM(256)(embedded_question)

# Join the question vector and the image vector along the feature axis.
merged = Concatenate(axis=-1)([encoded_question, encoded_image])

# Softmax over 1000 output classes (presumably one per candidate answer).
output = Dense(1000, activation='softmax')(merged)

# Final model: takes [image, question] and produces the class distribution.
vqa_model = Model(inputs=[image_input, question_input], outputs=output)

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
vqa_model.summary()
'''

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。