import sys
assert sys.version_info.major == 3 and sys.version_info.minor in (7, 8), 'Sorry. 3.7 or 3.8 only. f-strings.'
%%javascript
//hack to fix export
require.config({
paths: {
d3: 'https://cdnjs.cloudflare.com/ajax/libs/d3/5.9.2/d3',
jquery: 'https://code.jquery.com/jquery-3.4.1.min',
plotly: 'https://cdn.plot.ly/plotly-latest.min'
},
shim: {
plotly: {
deps: ['d3', 'jquery'],
exports: 'plotly'
}
}
});
import requests, re, csv, pickle
import pandas as pd
import numpy as np
from collections import Counter  # used later for the per-bin counts
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
init_notebook_mode()
class Wiki():
api = "https://en.wikipedia.org/w/api.php"
def __init__(self, category,
no_views=False,
no_content=False,
custom_page_parser=None,
extra_fields=None,
forbidden_categories_keywords=None):
"""
        custom_page_parser is for content mining: a function that, given wiki text, returns a dictionary of whatever it mined.
        Any extra fields need to be added to extra_fields or to_csv will fail.
"""
self.session = requests.Session()
self.no_views = no_views
self.no_content = no_content
self.data = {}
self.category = category
self.category_cleaned = category.replace(' ','_').replace('Category:','')
if custom_page_parser:
self.custom_page_parser = custom_page_parser
else:
self.custom_page_parser = lambda text: {}
if extra_fields:
self.extra_fields = extra_fields
else:
self.extra_fields = []
if forbidden_categories_keywords:
self.forbidden_categories_keywords = forbidden_categories_keywords
else:
self.forbidden_categories_keywords = []
def get(self, params):
data = self.session.get(url=self.api, params=params).json()
if 'continue' in data:
params['cmcontinue'] = data['continue']['cmcontinue']
t = list(data['query'].keys())[0]
new_data = self.get(params)
new_data['query'][t] = [*data['query'][t], *new_data['query'][t]]
data = new_data
return data
def add_datum(self, data, cat):
for d in data:
name = d["title"]
if name not in self.data:
self.data[name] = d
self.data[name]['category'] = cat
if not self.no_views:
self.data[name]['views'] = self.get_pageviews(name)
if not self.no_content:
wiki = self.get_content(name)
for key, value in self.custom_page_parser(wiki).items():
self.data[name][key] = value
else:
self.data[name]["category"] += '|' + cat
def get_subcategories(self, cat):
return [subcat for subcat in self.get_members(cat, 'subcat')
if all([k not in subcat['title'].lower() for k in self.forbidden_categories_keywords])]
def get_pages(self,cat):
return self.get_members(cat, 'page')
def get_members(self, cat, cmtype='subcat|page'):
params = {
'action': "query",
'list': "categorymembers",
'cmtitle': cat,
'cmtype': cmtype,
'cmdir': "desc",
'format': "json"
}
r = self.get(params)
if 'query' not in r:
print(f'{cat} replied with {str(r)}.')
return []
data = r['query']['categorymembers']
self.add_datum(data, cat)
return data
def get_pages_recursively(self, cat=None):
if cat is None:
cat = self.category
subcats = [s['title'] for s in self.get_subcategories(cat)]
data = self.get_pages(cat)
for c in subcats:
ndata = self.get_pages_recursively(c)
print(c, len(data),len(ndata))
data.extend(ndata)
return data
def get_pageviews(self, page):
url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{page.replace(' ','_')}/monthly/2018060100/2019060100"
r = self.session.get(url).json()
if 'items' in r:
return sum([i['views'] for i in r['items']])/365
else:
print('error',page, r)
return 'NA'
def get_content(self,page):
params = {
'action': "query",
'prop': 'revisions',
'rvprop': 'content',
'rvsection': 0,
'titles': page,
'format': "json"
}
data = self.session.get(url=self.api, params=params).json()
pageid = list(data['query']['pages'].keys())[0]
wikimarkup = data['query']['pages'][pageid]['revisions'][0]['*']
return wikimarkup.encode('utf-8','ignore').decode('unicode_escape','ignore') #not quite right
def to_csv(self):
with open(f'{self.category_cleaned}.csv','w',newline='') as w:
dw = csv.DictWriter(w,['title','category', 'ns','views','pageid']+self.extra_fields)
dw.writeheader()
dw.writerows(self.data.values())
return self
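# A minimal usage sketch of the generic Wiki class (illustrative, not part of the original run):
# the custom_page_parser hook receives raw wiki markup and returns a dict, and every key it
# returns must also be listed in extra_fields, otherwise to_csv() will fail. The category and
# parser below are made-up examples.
def markup_length(wiki_text):
    return {'markup length': len(wiki_text)}

# glider = Wiki('Category:Glider aircraft',
#               custom_page_parser=markup_length,
#               extra_fields=['markup length'])
# glider.get_pages_recursively()  # hits the live API, so left commented out
# glider.to_csv()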
## aircraft specific
class Planewiki(Wiki):
"""
{|{{Infobox Aircraft Begin
|name=A-132 Tangará
|image =
|caption=
}}{{Infobox Aircraft Type
|type=Primary trainer
|national origin =[[Brazil]]
|manufacturer=[[Embraer#Acquisition of Aerotec|Aerotec]]
|first flight =26 February 1981
|introduction=
|retired =
|status =
|primary user=[[Bolivian Air Force]]
|number built =7
|developed from =Aerotec Uirapuru
}}'''
"""
def __init__(self, category, no_views=False, no_content=False):
super().__init__(category, no_views=no_views, no_content=no_content,
custom_page_parser=self.plane_data,
                         extra_fields=['first flight', 'number built', 'unit cost', 'introduction', 'retired', 'introduced', 'status'],
                         forbidden_categories_keywords=['airliner', '9/11', 'deaths', 'organisation', 'accident', 'incident', 'squadron', 'fiction', 'film', 'novel', 'star wars', 'star trek'])
#forbidden keywords have a title problem.
def plane_data(self, wiki):
data = {}
for key in self.extra_fields:
            rex = re.search(r'\|' + key + r'\s*=\s*([^\n|<]+)', wiki)
if rex:
data[key] = rex.group(1).rstrip()
else:
data[key] = 'Error'
return data
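# Illustrative sanity check (not in the original notebook): run the regex parser over the
# infobox sample quoted in the Planewiki docstring above. Fields present in the snippet come
# back verbatim, empty or absent ones come back as 'Error'.
_checker = Planewiki('Category:1980s Brazilian military aircraft', no_views=True, no_content=True)
print(_checker.plane_data(Planewiki.__doc__))
# e.g. {'first flight': '26 February 1981', 'number built': '7', 'unit cost': 'Error', ...}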
# demo? try... Category:1980s Brazilian military aircraft
for title in ('Category:Cancelled aircraft projects','Category:Proposed_aircraft','Category:Military aircraft','Category:Civil aircraft'):
plane = Planewiki(title)
plane.get_pages_recursively()
print('******',title, len(plane.data))
plane.to_csv()
pickle.dump(plane.data, open(f'{plane.category_cleaned}.p','wb'))
military = pickle.load(open('Military_aircraft.p','rb'))
civil = pickle.load(open('Civil_aircraft.p','rb'))
def get_built(txt):
if isinstance(txt, float) or isinstance(txt, int):
return txt
elif isinstance(txt, str):
txt_clean = txt.replace(',','').replace('.','')
## case 1. there are multiple xxxx: nnn
        rex = re.findall(r'.*?:\s+(\d+)', txt_clean)
if rex:
n = 0
for rexi in rex:
n+=int(rexi)
return n
## case 2.
        rex = re.search(r'(\d+)', txt_clean)
if rex:
return int(rex.group(1))
else:
return None
else:
return None
group_labels = ['Single','2-5','6-50','50-200','200-2,000','2,000+']
def groupify(nb):
x = get_built(nb)
    if x is None:
return 0
elif x == 1:
return 1
elif x <=5:
return 2
elif x <=50:
return 3
elif x <=200:
return 4
elif x <=2_000:
return 5
else:
return 6
def yearify(row):
for val in (f"{row['first flight']}{row.introduced}{row.introduction}", str(row.category)):
        x = re.findall(r'((?:19|20)\d\d)', str(val))
if x:
v = min([int(n) for n in x])
if v < 2020 and v > 1899:
return v
return None
def to_df(data):
fulldataset = pd.DataFrame.from_records(list(data.values()))
fulldataset = fulldataset.assign(built_group=fulldataset['number built'].apply(groupify))\
.assign(built_number=fulldataset['number built'].apply(get_built).values)\
.assign(views=fulldataset.views.apply(lambda x: float(x) if x != 'NA' else None))\
.assign(year= fulldataset.apply(yearify,axis=1))
ticon = lambda term: fulldataset.title.str.contains(term, regex=False, na=True)
return fulldataset.loc[(~fulldataset.built_number.isnull()) &
(fulldataset.built_group != 0) &
(~fulldataset.views.isnull()) &
(~ticon('Category:')) &
(~ticon('Squadron')) &
(~ticon('Division')) &
(~ticon('Bomber Wing')) &
(~ticon('Fighter Wing')) &
(~ticon('Group'))]
mil_df = to_df(military)
civ_df = to_df(civil)
vio_traces = []
scatter_traces = []
pivot_traces = []
for side, label, dataset, color, dark_color in (("negative", "civil", civ_df, "teal","blue"), ("positive", "military", mil_df, "coral","red")):
trace = {
"type": 'violin',
"x": dataset.built_group,
"y": dataset.views,
"name": label,
"text": dataset.title,
"side": side,
"box": {
"visible": True
},
"meanline": {
"visible": True
},
'spanmode': 'hard',
"line": {
"color": color
}
}
vio_traces.append(trace)
pivot = Counter(dataset.built_group)
pivot_trace = go.Scatter(
mode= 'lines',
x= list(range(1,7)),
y= [pivot[k] for k in range(1,7)],
name= f'N in bin ({label})',
        text=group_labels,
yaxis='y2',
line={'color': color}
)
pivot_traces.append(pivot_trace)
scatter_traces.append(go.Scatter(
x = dataset.built_number+np.random.random_sample(size=len(dataset.built_number))/3,
y = dataset.views,
text = dataset.title,
name = label,
mode = 'markers',
marker= {'opacity':0.2, 'color': dark_color}
))
iplot({'data': vio_traces, #+pivot_traces
'layout':{'title': 'Plane population vs. popularity',
'yaxis':{'title':'Daily views on Wiki page'},
'yaxis2': {
'overlaying': 'y', 'range': [0,600],
'side': 'right'},
'xaxis': {'ticktext': group_labels,
'tickvals': list(range(1,1+len(group_labels)))},
"violingap": 0,
"violinmode": "overlay"
}
})
iplot({'data': vio_traces, #+pivot_traces
'layout':{'title': 'Plane population vs. popularity',
'yaxis':{'title':'Daily views on Wiki page', 'type': 'log'},
'yaxis2': {
'overlaying': 'y', 'range': [0,600],
'side': 'right'},
'xaxis': {'ticktext': group_labels,
'tickvals': list(range(1,1+len(group_labels)))},
"violingap": 0,
"violinmode": "overlay"
}
})
iplot({'data':scatter_traces,
'layout':{'title': 'Planes: population vs. popularity','xaxis':{'title':'Numbers built', 'type': 'log'},'yaxis':{'title':'Daily views on en.wiki', 'type': 'log'}}
})
So the B-1 bomber is a curious case: its infobox lists two numbers, the first 5 units for the B-1A and the second 100 units for the B-1B, which shows that this field is not overly reliable...
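As a quick illustration, get_built only sums numbers that follow a colon, otherwise it keeps the first number it finds, so a double entry like the B-1's gets summed or truncated depending on how the infobox happens to be written (the strings below are made up for the example, not copied from the article):
# Hypothetical infobox values showing how get_built handles multi-number entries.
print(get_built('B-1A: 5  B-1B: 100'))   # colon-separated numbers are summed -> 105
print(get_built('5 B-1A and 100 B-1B'))  # otherwise only the first number is kept -> 5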
scatter_traces = []
for label, dataset, dark_color in (("civil", civ_df, "teal"), ("military", mil_df, "coral")):
df = dataset.loc[~dataset.year.isnull()].loc[~dataset.views.isnull()]
scatter_traces.append(go.Scattergl(
x = df.year,
y = df.views,
        text = df.title,
name = label,
mode = 'markers',
marker= {'opacity':0.2,
'color': dark_color,
'size': np.log2(df.built_number.values + 1),
'sizeref': 1.3,
'sizemin':0.2}
))
scale = np.array(range(4,13)) #73000.0
scatter_traces.append(go.Scattergl(
x = [1915+8*n for n in scale],
y = [0.3]*len(scale),
text = [str(2**n) for n in scale],
textposition='bottom center',
name = 'N built scale',
mode = 'markers+text',
marker= {'opacity':0.5,
'color': 'darkorchid',
'size': scale,
'sizeref': 1.3,
'sizemin':0.2}
))
iplot({'data':scatter_traces,
'layout':{'title': 'Planes: Numbers built, year of introduction and wikipedia pagevisits',
'xaxis':{'title':'Year of introduction'},
'yaxis':{'title':'Daily views on en.wiki', 'type': 'log'},
'shapes': [{
'type': 'rect',
'x0': 1939,
'y0': 0.1,
'x1': 1945,
'y1': 1e4,
'line': {
'width': 0,
},
'fillcolor': 'rgba(10, 10, 10, 0.1)',
},
{
'type': 'rect',
'x0': 1914,
'y0': 0.1,
'x1': 1918,
'y1': 1e4,
'line': {
'width': 0,
},
'fillcolor': 'rgba(10, 10, 10, 0.1)',
}
]}
})