In [301]:
import sys
# The notebook relies on f-strings and on insertion-ordered dicts, so 3.7 is the
# minimum. The previous check required *exactly* 3.7, contradicting its own
# message and rejecting every newer interpreter.
assert sys.version_info >= (3, 7), 'Sorry. Python 3.7 or newer only. f-strings.'
In [307]:
%%javascript
// Hack to fix export: register CDN locations for d3, jquery and plotly with
// RequireJS so that `require(['plotly'], ...)` can resolve them.
// NOTE(review): presumably needed so plotly figures render in the exported
// notebook -- confirm against the export path actually used.
require.config({
  paths: {
    d3: 'https://cdnjs.cloudflare.com/ajax/libs/d3/5.9.2/d3',
    jquery: 'https://code.jquery.com/jquery-3.4.1.min',
    plotly: 'https://cdn.plot.ly/plotly-latest.min'
  },

  shim: {
    plotly: {
      // d3 and jquery are declared as dependencies of plotly, whose exported
      // value is taken from the global named 'plotly'.
      deps: ['d3', 'jquery'],
      exports: 'plotly'
    }
  }
});
In [26]:
import csv
import pickle
import re
from collections import Counter
from urllib.parse import quote

import numpy as np
import pandas as pd
import plotly.graph_objs as go
import requests
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()
In [24]:
class Wiki():
    """
    Crawler for one English-Wikipedia category tree.

    Every member returned by the categorymembers API (pages *and* subcategory
    pages) is stored in ``self.data`` keyed by title. Unless disabled, each
    newly seen title is enriched with its average daily page views and with
    whatever ``custom_page_parser`` mines from the lead section's wiki markup.
    """
    api = "https://en.wikipedia.org/w/api.php"

    def __init__(self, category,
                 no_views=False,
                 no_content=False,
                 custom_page_parser=None,
                 extra_fields=None,
                 forbidden_categories_keywords=None):
        """
        :param category: root category title, e.g. ``'Category:Military aircraft'``.
        :param no_views: if True, skip the page-view lookup for each title.
        :param no_content: if True, skip fetching/parsing the wiki markup.
        :param custom_page_parser: for content mining. A function that, given
            wiki text, returns a dictionary of whatever it mined.
        :param extra_fields: names of the keys produced by ``custom_page_parser``.
            Any extra fields need to be added here or ``to_csv`` will fail.
        :param forbidden_categories_keywords: lowercase substrings; a
            subcategory whose lowercased title contains any of them is not
            descended into.
        """
        self.session = requests.Session()
        self.no_views = no_views
        self.no_content = no_content
        self.data = {}
        self.category = category
        # Used as the file stem for the CSV output.
        self.category_cleaned = category.replace(' ','_').replace('Category:','')
        self.custom_page_parser = custom_page_parser or (lambda text: {})
        self.extra_fields = extra_fields or []
        self.forbidden_categories_keywords = forbidden_categories_keywords or []

    def get(self, params):
        """
        Query the MediaWiki API and follow ``continue`` tokens until exhausted.

        :param params: query parameters; the dict is not modified.
        :return: the last JSON reply, whose first ``query`` list holds the
            members of every fetched page in order. If the final reply has no
            ``query`` key (e.g. an API error) it is returned untouched so the
            caller can report it.
        """
        request = dict(params)  # work on a copy: never mutate the caller's dict
        collected = []
        list_key = None
        # Iterative rather than recursive: large categories need many requests
        # and would otherwise hit Python's recursion limit.
        while True:
            reply = self.session.get(url=self.api, params=request).json()
            if 'continue' not in reply:
                break
            list_key = list(reply['query'].keys())[0]
            collected.extend(reply['query'][list_key])
            # Merge the whole continuation object back into the request instead
            # of hard-coding 'cmcontinue', so other list modules work too.
            request.update(reply['continue'])
        if list_key is not None and 'query' in reply:
            reply['query'][list_key] = [*collected, *reply['query'].get(list_key, [])]
        return reply

    def add_datum(self, data, cat):
        """
        Record category members in ``self.data``.

        A title seen for the first time is stored (the member dict itself is
        kept and extended in place) together with its category, views and
        parsed content. A title seen again only gets ``cat`` appended to its
        '|'-separated ``category`` string.
        """
        for d in data:
            name = d["title"]
            if name in self.data:
                self.data[name]["category"] += '|' + cat
                continue
            self.data[name] = d
            d['category'] = cat
            if not self.no_views:
                d['views'] = self.get_pageviews(name)
            if not self.no_content:
                d.update(self.custom_page_parser(self.get_content(name)))

    def get_subcategories(self, cat):
        """
        Return the subcategory members of ``cat`` whose lowercased title
        contains none of the forbidden keywords.

        Note that all subcategories, forbidden or not, have already been
        recorded by ``get_members`` before this filter is applied.
        """
        return [subcat for subcat in self.get_members(cat, 'subcat')
                if not any(k in subcat['title'].lower() for k in self.forbidden_categories_keywords)]

    def get_pages(self,cat):
        """Return (and record) the page members of ``cat``."""
        return self.get_members(cat, 'page')

    def get_members(self, cat, cmtype='subcat|page'):
        """
        Fetch the members of ``cat`` of the given type(s) and record them.

        :return: list of member dicts, or ``[]`` (after printing the reply)
            when the API answer has no ``query`` section.
        """
        params = {
            'action': "query",
            'list': "categorymembers",
            'cmtitle': cat,
            'cmtype': cmtype,
            'cmdir': "desc",
            # Ask for the largest page size the API allows; this only reduces
            # the number of round trips, the result is the same.
            'cmlimit': "max",
            'format': "json"
            }
        r = self.get(params)
        if 'query' not in r:
            print(f'{cat} replied with {str(r)}.')
            return []
        data = r['query']['categorymembers']
        self.add_datum(data, cat)
        return data

    def get_pages_recursively(self, cat=None, _visited=None):
        """
        Walk the category tree depth-first and return all page members found.

        :param cat: category to start from; defaults to ``self.category``.
        :param _visited: internal set of categories already walked. Category
            graphs can contain cycles, which previously meant endless recursion.
        :return: list of page member dicts (pages reachable through several
            categories may appear more than once).
        """
        if cat is None:
            cat = self.category
        if _visited is None:
            _visited = set()
        _visited.add(cat)
        subcats = [s['title'] for s in self.get_subcategories(cat)]
        data = self.get_pages(cat)
        for c in subcats:
            if c in _visited:
                continue
            ndata = self.get_pages_recursively(c, _visited)
            print(c, len(data),len(ndata))
            data.extend(ndata)
        return data

    def get_pageviews(self, page):
        """
        Return the average daily views of ``page`` on en.wikipedia.

        Sums the monthly buckets between 2018-06-01 and 2019-06-01 and divides
        by 365. NOTE(review): if the API treats the end month as inclusive the
        window holds 13 months, not 12 -- confirm.

        :return: a float, or the string ``'NA'`` (after printing the reply)
            when the API returns no ``items``.
        """
        # Percent-encode the title: characters such as '/' or '?' would
        # otherwise change the URL's path or query.
        article = quote(page.replace(' ','_'), safe='')
        url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/all-agents/{article}/monthly/2018060100/2019060100"
        r = self.session.get(url).json()
        if 'items' in r:
            return sum(i['views'] for i in r['items'])/365
        else:
            print('error',page, r)
            return 'NA'

    def get_content(self,page):
        """
        Return the wiki markup of the lead section (section 0) of ``page``.

        The JSON reply is already decoded text, so the markup is returned as
        is. The former ``encode('utf-8').decode('unicode_escape')`` round trip
        garbled every non-ASCII character.

        :return: the markup, or ``''`` (after printing the reply) when the
            page has no revision content, e.g. a missing page.
        """
        params = {
            'action': "query",
            'prop': 'revisions',
            'rvprop': 'content',
            'rvsection': 0,
            'titles': page,
            'format': "json"
            }
        data = self.session.get(url=self.api, params=params).json()
        for page_data in data.get('query', {}).get('pages', {}).values():
            revisions = page_data.get('revisions')
            if revisions:
                return revisions[0].get('*', '')
        print('error', page, data)
        return ''

    def to_csv(self):
        """
        Write ``self.data`` to ``<category_cleaned>.csv`` and return ``self``.

        Raises ``ValueError`` (from ``csv.DictWriter``) if a record holds a key
        that is neither a standard column nor listed in ``extra_fields``.
        """
        # Explicit UTF-8: titles contain non-ASCII characters and the platform
        # default encoding may not be able to represent them.
        with open(f'{self.category_cleaned}.csv','w',newline='',encoding='utf-8') as w:
            dw = csv.DictWriter(w,['title','category', 'ns','views','pageid']+self.extra_fields)
            dw.writeheader()
            dw.writerows(self.data.values())
        return self

## aircraft specific
class Planewiki(Wiki):
    """
    Wiki crawler specialised for aircraft articles: mines a fixed set of
    infobox fields from the lead section of each page.

    Example of the markup being mined:

    {|{{Infobox Aircraft Begin
     |name=A-132 Tangará
     |image =
     |caption=
    }}{{Infobox Aircraft Type
     |type=Primary trainer
     |national origin =[[Brazil]]
     |manufacturer=[[Embraer#Acquisition of Aerotec|Aerotec]]
     |first flight =26 February 1981
     |introduction=
     |retired =
     |status =
     |primary user=[[Bolivian Air Force]]
     |number built =7
     |developed from =Aerotec Uirapuru
    }}'''
    """

    def __init__(self, category, no_views=False, no_content=False):
        """
        :param category: root category title to crawl.
        :param no_views: if True, skip the page-view lookup.
        :param no_content: if True, skip fetching and parsing the infobox.
        """
        super().__init__(category, no_views=no_views, no_content=no_content,
                         custom_page_parser=self.plane_data,
                         extra_fields = ['first flight','number built','unit cost','introduction','retired','introduced','status'],
                         # 'accident' / 'incident' were misspelt ('ancident', 'iccident')
                         # and therefore never excluded anything.
                         forbidden_categories_keywords=['airliner','9/11','deaths','organisation','accident','incident','squadron', 'fiction','film','novel','star wars','star trek'])
        # Original author's note: forbidden keywords have a title problem.
        # NOTE(review): presumably because they are matched as plain substrings
        # of subcategory titles and so can over-exclude -- confirm.

    def plane_data(self, wiki):
        """
        Extract every field in ``self.extra_fields`` from infobox markup.

        :param wiki: wiki markup of a page's lead section.
        :return: dict mapping each field name to the text following
            ``|<field> =`` up to the next newline, '|' or '<' (trailing
            whitespace stripped), or to ``'Error'`` when the field is absent.
        """
        data = {}
        for key in self.extra_fields:
            # Raw strings avoid invalid-escape warnings; re.escape keeps the
            # key literal; the optional whitespace after the pipe also accepts
            # the common "| key = value" infobox layout.
            rex = re.search(r'\|\s*' + re.escape(key) + r'\s*=\s*([^\n|<]+)', wiki)
            data[key] = rex.group(1).rstrip() if rex else 'Error'
        return data
In [ ]:
# Crawl each root category and persist the result as both CSV and pickle.
# This hits the network once or twice per page and is slow.
# demo? try... Category:1980s Brazilian military aircraft
for title in ('Category:Cancelled aircraft projects','Category:Proposed_aircraft','Category:Military aircraft','Category:Civil aircraft'):
    plane = Planewiki(title)
    plane.get_pages_recursively()
    print('******',title, len(plane.data))
    plane.to_csv()
    # Context manager so the file is flushed and closed even if dumping fails.
    with open(f'{plane.category_cleaned}.p','wb') as pickle_file:
        pickle.dump(plane.data, pickle_file)
In [28]:
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# the crawl cell of this notebook, never files from an untrusted source.
with open('Military_aircraft.p','rb') as pickle_file:
    military = pickle.load(pickle_file)
with open('Civil_aircraft.p','rb') as pickle_file:
    civil = pickle.load(pickle_file)
In [254]:
def get_built(txt):
    """
    Turn an infobox 'number built' value into a number.

    :param txt: the raw field; a number, a string or anything else.
    :return:
        * numbers are returned unchanged, except NaN which yields ``None``;
        * strings: commas and periods are removed first, then
          - if the text contains ``label: nnn`` items, the sum of those counts;
          - otherwise the first run of digits as an int;
          - ``None`` when there are no digits;
        * any other type yields ``None``.
    """
    if isinstance(txt, (float, int)):
        # NaN is the only value that differs from itself. It stands for a
        # missing field and must not be treated as a count.
        return None if txt != txt else txt
    if not isinstance(txt, str):
        return None
    # Drop separators so that "1,000" / "1.000" read as 1000.
    txt_clean = txt.replace(',','').replace('.','')
    ## case 1. there are multiple xxxx: nnn
    labelled = re.findall(r'.*?:\s+(\d+)', txt_clean)
    if labelled:
        return sum(int(count) for count in labelled)
    ## case 2. take the first number found
    first = re.search(r'(\d+)', txt_clean)
    return int(first.group(1)) if first else None

# Tick labels for groups 1..6. Edges match the bins used in groupify
# (50 belongs to '6-50', 200 to '51-200', 2,000 to '201-2,000').
group_labels = ['Single','2-5','6-50','51-200','201-2,000','2,001+']
def groupify(nb):
    """
    Map a raw 'number built' value to a production-size group.

    :param nb: raw field value, parsed with ``get_built``.
    :return: 0 when the count is unknown (unparseable, NaN or below 1),
        otherwise 1..6 for 1, 2-5, 6-50, 51-200, 201-2,000 and above 2,000.
    """
    x = get_built(nb)
    # `x != x` catches NaN, which would otherwise fail every comparison below
    # and land in the top group. A count below 1 fits none of the labelled bins.
    if x is None or x != x or x < 1:
        return 0
    upper_bounds = (1, 5, 50, 200, 2_000)
    for group, upper in enumerate(upper_bounds, start=1):
        if x <= upper:
            return group
    return len(upper_bounds) + 1

def yearify(row, min_year=1900, max_year=2019):
    """
    Guess the year an aircraft appeared.

    The infobox date fields ('first flight', 'introduced', 'introduction') are
    searched first; only if they contain no usable year is the category string
    (e.g. 'Category:1980s Brazilian military aircraft') consulted.

    :param row: mapping/Series with the keys 'first flight', 'introduced',
        'introduction' and 'category'.
    :param min_year: smallest year accepted (inclusive).
    :param max_year: largest year accepted (inclusive).
    :return: the earliest accepted year of the first source that has one,
        or ``None``.
    """
    sources = (
        # Space-separated so digits of adjacent fields cannot fuse into a
        # bogus year ("20" + "1985" used to read as 2019).
        f"{row['first flight']} {row['introduced']} {row['introduction']}",
        str(row['category']),
    )
    for val in sources:
        # The lookarounds reject 4-digit slices of longer numbers while still
        # accepting decades such as "1980s".
        found = re.findall(r'(?<!\d)((?:19|20)\d\d)(?!\d)', str(val))
        years = [int(y) for y in found if min_year <= int(y) <= max_year]
        if years:
            return min(years)
    return None


def to_df(data):
    """
    Build the analysis DataFrame from a crawl result.

    :param data: dict of records as stored in ``Wiki.data`` (title -> record).
    :return: DataFrame with the added columns ``built_group``,
        ``built_number``, numeric ``views`` and ``year``, restricted to rows
        that have a known number built, a non-zero group and known views, and
        whose title contains none of the excluded terms.
    """
    records = pd.DataFrame.from_records(list(data.values()))
    number_built = records['number built']

    # All derived columns are computed from the untouched `records` frame.
    enriched = (
        records
        .assign(built_group=number_built.apply(groupify))
        .assign(built_number=number_built.apply(get_built).values)
        .assign(views=records.views.apply(lambda x: float(x) if x != 'NA' else None))
        .assign(year=records.apply(yearify, axis=1))
    )

    keep = (
        (~enriched.built_number.isnull())
        & (enriched.built_group != 0)
        & (~enriched.views.isnull())
    )
    # Drop category pages and military-unit articles picked up by the crawl.
    excluded_terms = ('Category:', 'Squadron', 'Division', 'Bomber Wing', 'Fighter Wing', 'Group')
    for term in excluded_terms:
        keep = keep & (~enriched.title.str.contains(term, regex=False, na=True))
    return enriched.loc[keep]

mil_df, civ_df = to_df(military), to_df(civil)
In [187]:
# Violin plots (linear and log y-axis) of daily views per production-size
# group, followed by a log-log scatter of views against number built.
vio_traces = []
scatter_traces = []
pivot_traces = []
# Fixed seed so the scatter jitter, and hence the figure, is reproducible.
jitter_rng = np.random.RandomState(0)

for side, label, dataset, color, dark_color in (("negative", "civil", civ_df, "teal","blue"), ("positive", "military", mil_df, "coral","red")):
    # Half a violin per dataset: civil on the negative side, military on the positive.
    vio_traces.append({
        "type": 'violin',
        "x": dataset.built_group,
        "y": dataset.views,
        "name": label,
        "text": dataset.title,
        "side": side,
        "box": {
            "visible": True
        },
        "meanline": {
            "visible": True
        },
        'spanmode': 'hard',
        "line": {
            "color": color
        }
    })

    # Number of aircraft per group. Built for the secondary axis but not
    # plotted by default (see the `# + pivot_traces` notes below).
    pivot = Counter(dataset.built_group)
    pivot_traces.append(go.Scatter(
        mode='lines',
        x=list(range(1,7)),
        y=[pivot[k] for k in range(1,7)],
        name=f'N in bin ({label})',
        # Was the undefined name `correct_order`; presumably the bin labels
        # were meant -- confirm.
        text=group_labels,
        yaxis='y2',
        line={'color': color}
    ))

    scatter_traces.append(go.Scatter(
        # Small horizontal jitter so identical counts do not overplot.
        x=dataset.built_number + jitter_rng.random_sample(size=len(dataset.built_number))/3,
        y=dataset.views,
        text=dataset.title,
        name=label,
        mode='markers',
        marker={'opacity':0.2, 'color': dark_color}
    ))

# Same violin figure twice: linear y-axis first, then logarithmic.
for yaxis_extra in ({}, {'type': 'log'}):
    iplot({'data': vio_traces,  # + pivot_traces
           'layout': {'title': 'Plane population vs. popularity',
                      'yaxis': {'title': 'Daily views on Wiki page', **yaxis_extra},
                      'yaxis2': {
                          'overlaying': 'y', 'range': [0,600],
                          'side': 'right'},
                      'xaxis': {'ticktext': group_labels,
                                'tickvals': list(range(1,1+len(group_labels)))},
                      "violingap": 0,
                      "violinmode": "overlay"
                      }
           })

iplot({'data':scatter_traces,
       'layout':{'title': 'Planes: population vs. popularity','xaxis':{'title':'Numbers built', 'type': 'log'},'yaxis':{'title':'Daily views on en.wiki', 'type': 'log'}}
      })

The B-1 bomber is a curious case: its infobox gives two numbers, the first being 5 units for the B-1A and the second 100 units for the B-1B. This shows that the scraped "number built" values are not overly reliable...

In [305]:
# Scatter of daily views against year of introduction; marker size encodes
# log2(number built + 1).
scatter_traces = []
for label, dataset, dark_color in (("civil", civ_df, "teal"), ("military", mil_df, "coral")):
    dated = dataset.dropna(subset=['year', 'views'])
    scatter_traces.append(go.Scattergl(
        x=dated.year,
        y=dated.views,
        # Hover text must come from the *filtered* frame. Using the unfiltered
        # `dataset` attached titles to the wrong points.
        text=dated.title,
        name=label,
        mode='markers',
        marker={'opacity':0.2,
                'color': dark_color,
                'size': np.log2(dated.built_number.values.astype(float) + 1),
                'sizeref': 1.3,
                'sizemin':0.2}
    ))

# Size legend: a marker of size n stands for roughly 2**n aircraft built,
# mirroring the log2 sizing above.
scale = np.arange(4, 13)  # original author's note: 73000.0 (meaning unclear)
scatter_traces.append(go.Scattergl(
    x=[1915+8*n for n in scale],
    y=[0.3]*len(scale),
    text=[str(2**n) for n in scale],
    textposition='bottom center',
    name='N built scale',
    mode='markers+text',
    marker={'opacity':0.5,
            'color': 'darkorchid',
            'size': scale,
            'sizeref': 1.3,
            'sizemin':0.2}
))

# Grey background bands; presumably marking the two World Wars -- confirm.
shaded_periods = ((1939, 1945), (1914, 1918))
shaded_bands = [{
    'type': 'rect',
    'x0': start,
    'y0': 0.1,
    'x1': end,
    'y1': 1e4,
    'line': {
        'width': 0,
    },
    'fillcolor': 'rgba(10, 10, 10, 0.1)',
} for start, end in shaded_periods]

iplot({'data':scatter_traces,
       'layout':{'title': 'Planes: Numbers built, year of introduction and wikipedia pagevisits',
                 'xaxis':{'title':'Year of introduction'},
                 'yaxis':{'title':'Daily views on en.wiki', 'type': 'log'},
                 'shapes': shaded_bands}
      })