Singapore has one of the highest smartphone penetration rates worldwide, at 149.2% (Sept 2017). In 2017 alone, we consumed over 14 petabytes of mobile data. On average, each person would have consumed 25 GB of data per year! Telecoms have since taken note of the demand and have begun aggressively pushing out mobile plans with more data, albeit at a higher price.
With this in mind, I set out to create a Telegram Bot to help consumers decide which plan is best suited for their needs.
from bs4 import BeautifulSoup
import pandas as pd
import re
import datetime
from urllib.request import Request, urlopen
# Before we begin scraping the site, let's define a function to clean the data.
# For parsing the data
def clean_text(s):
    """Convert the text of a scraped plan field into a number.

    Args:
        s: Raw text of a scraped element (talk time, SMS/MMS, data or price).

    Returns:
        10000 if the text contains "Unlimited"; "" if the text is empty or
        holds no digits, '+' or '.' characters; otherwise a float. The two
        special quantities "300...+1..." and "300" map to 1.3 and 0.3, and
        any other "X + Y" text is returned as the sum of its parts.

    Raises:
        ValueError: If the characters that remain after cleaning do not
            form valid numbers (for example "1.2.3").
    """
    # "Unlimited" is represented by the sentinel value 10000.
    if "Unlimited" in s:
        return 10000
    if s == "":
        return ""

    # Keep only digits, '+' and '.'; units, currency signs and spaces go.
    new_s = re.sub("[^0-9+.]", "", s)
    if new_s == "":
        return ""

    # Hard-coded quantities: "300 MB + 1 GB" -> 1.3 and "300 MB" -> 0.3.
    if new_s == "300+1":
        return 1.3
    if new_s == "300":
        return 0.3

    # "X GB + Y GB": add the parts explicitly instead of calling eval() on
    # scraped (untrusted) text. Empty parts from stray '+' signs are skipped.
    if "+" in new_s:
        parts = [part for part in new_s.split("+") if part]
        if not parts:
            return ""
        return float(sum(float(part) for part in parts))

    return float(new_s)
M1's web store is relatively simple to scrape data from. We begin by opening and reading the site.
# Download the M1 phone catalogue page. The context manager closes the HTTP
# response as soon as its body has been read and parsed.
with urlopen("https://www.m1.com.sg/personal/mobile/phones/filters/all-plans/all/all/0/1500/0/0/none") as site:
    soup = BeautifulSoup(site.read(), "html.parser")
# Per-model links, filled in by the loop that follows.
links = []
Next, we extract all the links (each pointing to a single phone model) from the site and store them in a list.
# Collect the href of every matching anchor. Runs of whitespace inside the
# href are collapsed to a single '%20' so the link can be appended to the
# site root when it is opened later.
for link in soup.find_all("a", {"class": "light-blue hidetag"}):
    links.append("%20".join(link["href"].split()))
We then parse each link to fill our 8 main columns. This format will also be used to store information from the other three telecoms.
import csv  # kept from the original cell; not used in this block

# Column layout shared by the per-model frames and the combined frame.
columns = ('Provider','Phone','Plan','TalkTime(Mins)','SMS/MMS','Data(GB)','PayNow($)','PerMonth($)')
df_contract = pd.DataFrame(columns=columns)
# One frame per phone model; joined once after the loop instead of growing
# df_contract with DataFrame.append, which was removed in pandas 2.0.
model_frames = []
# Going through each of the phone links
for link in links:
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen("https://www.m1.com.sg" + link) as site2:
        soup2 = BeautifulSoup(site2.read(), "html.parser")
    df_model = pd.DataFrame(columns=columns)
    # Get the model name and collapse every run of whitespace to one space.
    name = soup2.find("div", {"class": "title"})
    model = re.sub(r'\s+', ' ', name.get_text())
    # Plan name
    plans = [
        div.get_text()
        for div in soup2.find_all("div", {"class": "title color-orange font-size-14 font-weight-bold"})
    ]
    # TalkTime, SMS/MMS, Data: three consecutive entries per plan
    details = [
        clean_text(div.get_text())
        for div in soup2.find_all("div", {"class": "desc font-size-14"})
    ]
    # Pay Now
    price1 = [
        clean_text(div.get_text())
        for div in soup2.find_all("div", {"class": "font-size-15 line-height-20 color-orange font-weight-bold"})
    ]
    # Per Month
    price2 = [
        clean_text(div.get_text())
        for div in soup2.find_all("div", {"class": "font-size-15 line-height-20 color-3"})
    ]
    # Store the data in a dataframe (by rows)
    for i, plan in enumerate(plans):
        # 'Equipment Only' has no monthly instalment, so that column is left blank.
        # NOTE(review): price2[i] assumes price2 lines up index-for-index with
        # plans; if an 'Equipment Only' entry without a monthly price can
        # precede another plan, later lookups would be off. Confirm against the page.
        per_month = '' if plan == 'Equipment Only' else price2[i]
        df_model.loc[i] = ['M1', model, plan, details[3*i], details[3*i + 1], details[3*i + 2], price1[i], per_month]
    # Keep this model's frame for the combined table (empty frames add no rows).
    if not df_model.empty:
        model_frames.append(df_model)
# Combine all per-model frames; row labels are kept as they are in each frame.
if model_frames:
    df_contract = pd.concat(model_frames)
df_contract.sample(n=10)
Success! We got all the available phone models for M1 as well as their relevant information.
A Telegram bot wrapper for Python was released recently, making it much easier to write a bot.
The legacy code can still be viewed at
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
import logging
# Turn on logging for troubleshooting: every record shows its timestamp,
# logger name, level and message, at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
# Logger named after this module.
logger = logging.getLogger(__name__)
Next, we will define a few command handlers. These form the basis of almost every bot written with the Telegram bot Python wrapper.
# Bot token handed to Updater in main(); replace this placeholder with your own token.
token = '---YOUR BOT TOKEN HERE---'
def start(bot, update):
    """Send a message when the command /start is issued.

    Replies with a welcome text, then with an inline keyboard offering the
    three search modes (by model, by data, by price).

    Args:
        bot: First argument passed in by the handler framework (unused here).
        update: Incoming update; its message is the one replied to.
    """
    # Imported here because the file-level import only brings in names from
    # telegram.ext; without it the two keyboard classes below are undefined.
    from telegram import InlineKeyboardButton, InlineKeyboardMarkup

    update.message.reply_text('Welcome to Telcobot! Choose one of the three options to begin '
                              'the search for your ideal mobile plan!')
    # Two buttons on the first row, one on the second; callback_data
    # identifies which option was pressed.
    keyboard = [[InlineKeyboardButton("Search by model", callback_data='1'),
                 InlineKeyboardButton("Search by data", callback_data='2')],
                [InlineKeyboardButton("Search by price", callback_data='3')]]
    reply_markup = InlineKeyboardMarkup(keyboard)
    update.message.reply_text('Please choose:', reply_markup=reply_markup)
def error(bot, update, error):
    """Write a warning to the module logger for an update that caused an error.

    Args:
        bot: First argument passed in by the handler framework (unused here).
        update: The update that was being processed.
        error: The error that was raised.
    """
    warning_template = 'Update "%s" caused error "%s"'
    logger.warning(warning_template, update, error)
def main():
    """Start the bot."""
    # The updater is built from the bot token.
    bot_updater = Updater(token)

    # Handlers are registered on the updater's dispatcher:
    # the /start command handler and the error logger.
    dispatcher = bot_updater.dispatcher
    dispatcher.add_handler(CommandHandler('start', start))
    dispatcher.add_error_handler(error)

    # Start the Bot
    bot_updater.start_polling()

    # Run the bot until you press Ctrl-C or the process receives SIGINT,
    # SIGTERM or SIGABRT. This should be used most of the time, since
    # start_polling() is non-blocking and will stop the bot gracefully.
    bot_updater.idle()


if __name__ == '__main__':
    main()