This project is a realization of the ideas I came up with during my time at Event Galaxy, where I spent a month as a website manager.
The original intent was to create an application that would allow a user to provide an image of an advertising inflatable, either obtained online or captured live. The application would then classify this inflatable and redirect the customer to its matching product page on our website.
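To make that envisioned flow concrete, here is a hypothetical sketch of the end-to-end inference step, using the kind of classifier trained later in this post. The preprocessing mirrors the training pipeline below; the model, class dictionary and URL mapping are illustrative placeholders, not part of any actual application code.

import numpy as np
import cv2

def classify_and_lookup(image_path, model, class_dict, product_urls):
    # Hypothetical inference step: preprocess the image the same way as the
    # training data, predict a class, and map it to a product page URL
    img = cv2.imread(image_path)[:, :, ::-1]           # BGR -> RGB
    img = cv2.resize(img, (128, 128)) / 255.0          # match training preprocessing
    pred = model.predict(np.expand_dims(img, axis=0))  # batch of one
    label = int(np.argmax(pred[0]))
    return class_dict[label], product_urls[label]      # illustrative URL mapping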
In our line of work, customers often confuse the various advertising inflatables available and thus have difficulty communicating their ideas or vision to us.
A huge thanks to my director for allowing me to use the images on the site.
The main products & services page contains links to all the products.
We will start by scraping the URLs of the product pages. Begin by importing the necessary modules.
import requests
from bs4 import BeautifulSoup

url = "http://www.eventgalaxy.com.sg/products/lighted-balloon-stands/"
request = requests.get(url)
soup = BeautifulSoup(request.text, features="html.parser")

# Each product page is listed as an <li class="page_item"> element
products = soup.find_all("li", {"class": "page_item"})
for product in products:
    print(product.find("a")["href"])
By inspecting the page elements with Chrome's developer tools, we can identify the tags needed to extract the required URLs.
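To illustrate what the scraper is matching against, here is the same find_all call run on a hand-written HTML fragment that mimics the page's menu structure (the fragment and the product names in it are illustrative, not copied from the site):

from bs4 import BeautifulSoup

# Illustrative fragment mimicking the site's menu structure (not actual site HTML)
sample_html = """
<ul>
  <li class="page_item"><a href="http://www.eventgalaxy.com.sg/products/foo/">Foo</a></li>
  <li class="page_item"><a href="http://www.eventgalaxy.com.sg/products/bar/">Bar</a></li>
</ul>
"""

sample_soup = BeautifulSoup(sample_html, features="html.parser")
for item in sample_soup.find_all("li", {"class": "page_item"}):
    print(item.find("a")["href"])
# http://www.eventgalaxy.com.sg/products/foo/
# http://www.eventgalaxy.com.sg/products/bar/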
Opening up a specific product page and scrolling down reveals several categories for this particular product.
Obtaining the URL for each category is slightly trickier.
url = "http://www.eventgalaxy.com.sg/products/lighted-balloon-stands/"
request = requests.get(url)
soup = BeautifulSoup(request.text,features="html.parser")
start_at = soup.find("li",{"class":"page_item page-item-1742"})
shortened_soup = start_at.find_all_previous("li")
for category in shortened_soup:
try:
categoryUrl = category.find("a")['href']
print(categoryUrl)
except:
continue
We make the following amendments to the code to skip the irrelevant links.
for category in shortened_soup:
    try:
        categoryUrl = category.find("a")["href"]
        # links before the homepage link are the category links we want
        if categoryUrl == "http://www.eventgalaxy.com.sg":
            break
        print(categoryUrl)
    except (TypeError, KeyError):
        continue
Upon clicking a specific category, we are treated to all the images under this category. This is what we want to scrape for our product classification.
The code snippet below will scrape the first image, for illustration. In the actual code used (download_images.py), we download the images to our hard disk instead.
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
%matplotlib inline

url = "http://www.eventgalaxy.com.sg/products/lighted-balloon-stands/features/property-launch-events/"
request = requests.get(url)
soup = BeautifulSoup(request.text, features="html.parser")

# Each product photo sits inside a <div class="productCatPhoto"> element
imgs = soup.find_all("div", {"class": "productCatPhoto"})
for img in imgs:
    img = img.find("img")
    im = Image.open(BytesIO(requests.get(img["src"]).content))
    plt.imshow(im)
    plt.show()
    break  # only display the first image, for illustration
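For reference, the core of that saving step can be as simple as writing the response bytes to a file. A minimal sketch follows; the directory layout and filename scheme are assumptions, not necessarily those of download_images.py:

import os
import requests

def save_image(img_url, out_dir, filename):
    # Hypothetical saving step: fetch the image bytes and write them to disk
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, filename), "wb") as f:
        f.write(requests.get(img_url).content)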
By merging the three scripts (find_urls.py, find_categories.py and download_images.py) into one (download_all_images.py), we can download every image on the site in an efficient and orderly manner.
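The merged script essentially nests the three steps: for every product page, find its categories, and for every category, download its images. A simplified sketch of that structure, where the helper names are placeholders standing in for the logic of the three scripts above:

def download_all_images(base_url):
    # Placeholder helpers standing in for find_urls.py, find_categories.py
    # and download_images.py respectively
    for product_url in find_product_urls(base_url):
        for category_url in find_category_urls(product_url):
            for img_url in find_image_urls(category_url):
                save_image(img_url, out_dir=category_url.rstrip("/").split("/")[-1],
                           filename=img_url.split("/")[-1])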
That's the end for the scraping component of the project. Next up: Product Classification Using Convolutional Neural Networks!
import cv2
import numpy as np
import os
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

# Map numeric labels to class names, one class name per line in classes5.txt
with open("classes5.txt", "r") as txt:
    classes = [line.strip() for line in txt.readlines()]
class_dict = dict(zip(range(len(classes)), classes))
def loadImages(root, txt):
    # Each line of the listing file is "<image name> <label>"
    with open(txt, "r") as f:
        all_imgs = []
        all_classes = []
        for line in f.readlines():
            imgname = line.split(" ")[0]
            img_arr = cv2.imread(root + imgname)
            img_arr = img_arr[:, :, ::-1]  # convert OpenCV's BGR to RGB
            img_arr = cv2.resize(img_arr, (128, 128))
            all_imgs.append(img_arr)
            label = int(line.split(" ")[1].strip("\n"))
            all_classes.append(label)
    return np.array(all_imgs), np.array(all_classes)
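One caveat worth noting: cv2.imread returns None (rather than raising) when a file is missing or cannot be decoded, so the slicing on the next line would crash. A defensive variant of the loader might skip such files; a sketch, not part of the original code:

import cv2
import numpy as np

def loadImagesSafe(root, txt):
    # Variant of loadImages that skips unreadable files instead of crashing
    with open(txt, "r") as f:
        all_imgs, all_classes = [], []
        for line in f.readlines():
            imgname = line.split(" ")[0]
            img_arr = cv2.imread(root + imgname)
            if img_arr is None:  # cv2.imread returns None for unreadable files
                continue
            all_imgs.append(cv2.resize(img_arr[:, :, ::-1], (128, 128)))
            all_classes.append(int(line.split(" ")[1].strip("\n")))
    return np.array(all_imgs), np.array(all_classes)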
train_root = "D:\\Projects\\train\\"
test_root = "D:\\Projects\\test\\"
X_train, y_train = loadImages(train_root, "D:\\Projects\\Advertising\\train.txt")
X_test, y_test = loadImages(test_root, "D:\\Projects\\Advertising\\test.txt")
# Normalise pixel values to be between 0 and 1
X_train, X_test = X_train/255.0, X_test/255.0
print(X_train.shape)
plt.figure(figsize=(10, 10))
for i in range(25):
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(X_train[i], cmap=plt.cm.binary)
    plt.xlabel(class_dict[y_train[i]])
plt.show()
We begin with a minimal CNN: a single convolutional layer with max pooling, feeding a small dense classifier over the five product classes.
import tensorflow as tf
from tensorflow.keras import layers, models

model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation="relu", input_shape=(128, 128, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation="relu"))
model.add(layers.Dense(5, activation="softmax"))
model.summary()

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=100, batch_size=32)
# summarize history for accuracy
plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')  # no validation set was used during training
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.show()
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)
Next, we try a deeper network, stacking three convolutional layers before the dense classifier.
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation="relu", input_shape=(128, 128, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation="relu"))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation="relu"))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation="relu"))
model.add(layers.Dense(5, activation="softmax"))

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=100, batch_size=32)
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)
As a baseline, we also train a plain fully-connected network with no convolutional layers.
model = models.Sequential()
model.add(layers.Flatten(input_shape=(128, 128, 3)))
model.add(layers.Dense(64, activation="relu"))
model.add(layers.Dense(32, activation="relu"))
model.add(layers.Dense(5, activation="softmax"))

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=100, batch_size=32)
test_loss, test_acc = model.evaluate(X_test, y_test)
print('Test accuracy:', test_acc)
predictions = model.predict(X_test)
def convertPredToLabels(predictions):
    # Take the argmax of each softmax output;
    # equivalent to np.argmax(predictions, axis=1)
    labels = []
    for i in range(len(predictions)):
        labels.append(np.argmax(predictions[i]))
    return labels
pred_labels = convertPredToLabels(predictions)
def tabulateResults(pred_labels, actual_labels):
    # Count the number of correct predictions per class
    counter = dict(zip(range(len(classes)), np.zeros(len(classes))))
    for i in range(len(pred_labels)):
        if pred_labels[i] == actual_labels[i]:
            counter[pred_labels[i]] += 1
    return counter
tabulateResults(pred_labels, y_test)
unique, counts = np.unique(y_test, return_counts=True)
print(np.asarray((unique, counts)).T)
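Dividing the per-class correct counts from tabulateResults by these per-class totals gives a per-class accuracy; a short follow-up snippet along those lines (not in the original code):

counter = tabulateResults(pred_labels, y_test)
for label, total in zip(unique, counts):
    # Correct predictions for this class divided by its number of test images
    print(class_dict[label], counter[label] / total)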