advclip/dataset/make_nuswide.py

108 lines
3.1 KiB
Python

import os
import scipy.io as scio
import numpy as np
# mkdir mat
# mv make_nuswide.py mat
# python make_nuswide.py
# Root directory of the downloaded NUS-WIDE data; edit this before running.
root_dir = "PATH/TO/YOUR/DOWNLOAD/DIR/"
# NOTE: every sub-path below must stay relative (no leading "/").
# os.path.join() throws away all earlier components as soon as it meets an
# absolute one, which would silently drop root_dir from the result.
imageListFile = os.path.join(root_dir, "Low-Level-Features/ImageList/Imagelist.txt")
labelPath = os.path.join(root_dir, "nuswide/Groundtruth/AllLabels")
textFile = os.path.join(root_dir, "Low-Level-Features/NUS_WID_Tags/All_Tags.txt")
classIndexFile = os.path.join(root_dir, "Low-Level-Features/Concepts81.txt")
# you can use the image urls to download images
imagePath = os.path.join(root_dir, "nuswide/Flickr")
# Read the image list (one entry per line) and turn each entry into a path
# below imagePath, converting backslash separators to forward slashes.
with open(imageListFile, "r") as f:
    indexs = [
        os.path.join(imagePath, line.strip().replace("\\", "/"))
        for line in f
    ]
print("indexs length:", len(indexs))
#class_index = {}
#with open(classIndexFile, "r") as f:
# data = f.readlines()
#
#for i, item in enumerate(data):
# class_index.update({item.strip(): i})
# Build one caption per non-blank line of the tag file: the first
# whitespace-separated token is dropped and the remaining tokens are joined
# with single spaces.
captions = []
with open(textFile, "r") as f:
    for line in f:
        if not line.strip():
            # Blank lines are reported and skipped (no caption is stored).
            print("some line empty!")
            continue
        tokens = line.split()
        # A line with no tokens after the first one gets the placeholder
        # "123456", so no stored caption is ever the empty string.
        caption = " ".join(tokens[1:]) or "123456"
        captions.append(caption)
print("captions length:", len(captions))
#labels = np.zeros([len(indexs), len(class_index)], dtype=np.int8)
# label_lists = os.listdir(labelPath)
# used_label.txt lists, one per line, the ground-truth files (inside
# labelPath) to load. The sub-path is relative on purpose: a leading "/"
# would make os.path.join() discard root_dir.
with open(os.path.join(root_dir, "nuswide/Groundtruth/used_label.txt")) as f:
    # Skip blank lines: an empty name would resolve to labelPath itself
    # and fail when opened.
    label_lists = [line.strip() for line in f if line.strip()]
# Map each label file name to its column in the label matrix.
class_index = {item: i for i, item in enumerate(label_lists)}
# One row per image, one column per label; filled with 0/1 flags.
labels = np.zeros([len(indexs), len(class_index)], dtype=np.int8)
for item in label_lists:
    column = class_index[item]
    with open(os.path.join(labelPath, item), "r") as f:
        # Line number `row` of a label file holds the flag for image `row`;
        # the matrix starts at zero, so only the "1" entries need writing.
        for row, val in enumerate(f):
            if val.strip() == "1":
                labels[row, column] = 1
print("labels sum:", labels.sum())
# not_used_id.txt lists, one per line, the zero-based positions of samples to
# exclude. The sub-path is relative on purpose: a leading "/" would make
# os.path.join() discard root_dir.
with open(os.path.join(root_dir, "nuswide/Groundtruth/not_used_id.txt")) as f:
    not_used_id = [int(line.strip()) for line in f if line.strip()]
# for item in not_used_id:
#     indexs.pop(item)
#     captions.pop(item)
#     labels = np.delete(labels, item, 0)
# Set membership keeps the filtering linear; repeated list.remove() calls
# would be quadratic in the number of samples.
dropped = set(not_used_id)
out_of_range = sorted(i for i in dropped if not 0 <= i < len(indexs))
if out_of_range:
    raise ValueError(
        "not_used_id.txt contains ids outside the image list: "
        + ", ".join(str(i) for i in out_of_range[:10])
    )
# Positions of the samples that are kept; also used to select label rows.
ind = [i for i in range(len(indexs)) if i not in dropped]
indexs = [item for i, item in enumerate(indexs) if i not in dropped]
captions = [item for i, item in enumerate(captions) if i not in dropped]
# An explicit integer dtype keeps the indexing valid when nothing is kept.
ind = np.asarray(ind, dtype=np.int64)
labels = labels[ind]
# ind = range(len(indexs))
print("indexs length:", len(indexs))
print("captions length:", len(captions))
print("labels shape:", labels.shape)
# Write the results below <root_dir>/mat. The sub-path is relative on purpose:
# a leading "/" would make os.path.join() discard root_dir and target the
# filesystem root instead.
mat_dir = os.path.join(root_dir, "mat")
os.makedirs(mat_dir, exist_ok=True)
# savemat() takes a dict mapping variable names to values.
indexs = {"index": indexs}
captions = {"caption": captions}
labels = {"category": labels}
scio.savemat(os.path.join(mat_dir, "index.mat"), indexs)
# scio.savemat("caption.mat", captions)
scio.savemat(os.path.join(mat_dir, "label.mat"), labels)
# Captions are stored as plain text, one caption per line, not as a .mat file.
captions = [item + "\n" for item in captions["caption"]]
with open(os.path.join(mat_dir, "caption.txt"), "w") as f:
    f.writelines(captions)
print("finished!")