| import pandas as pd |
|
|
# Category groupings handed to generateOneHot. A bare string receives an
# indicator column of its own; a nested list is folded into one shared column.
enterpriseGroups = [
    'facialRecognition',
    ['safetySecurity', 'recruitment', 'biometricData'],
]
societyGroups = [
    ['policing', 'controlAccessToServices'],
]
dataTypeGroups = [
    ['dataTypePersonal', 'dataTypeSensistivePersonal'],
    ['dataTypeRestricted'],
]
capabilitiesGroups = ['decisionSupportSystems']
|
|
# Risk vertical names, split into a technical and a governance family.
technicalRisks = ['Robustness', 'Efficacy', 'Privacy', 'Bias', 'Explainability']
governanceRisks = ['Financial', 'Reputational', 'Ethics', 'Regulation']
# Full list: 'Overall' first, then the governance risks, then the technical ones.
riskVerticals = ['Overall', *governanceRisks, *technicalRisks]
|
|
|
|
def mergeCostColumns(home, commisioned, licensed):
    """Collapse the three cost answers into a single 0/1 flag.

    Returns 1 when at least one of the three arguments equals the string
    'insignificant', and 0 otherwise.
    """
    costAnswers = (home, commisioned, licensed)
    return int(any(answer == 'insignificant' for answer in costAnswers))
|
|
|
|
def generateUniqueEntries(targetColumn):
    """Return the distinct comma-separated tokens found in a column.

    Args:
        targetColumn: an object exposing ``.values`` (e.g. a pandas Series)
            whose cells are strings such as ``'a,b,c'``.

    Returns:
        A list of the unique tokens, in order of first appearance. Cells that
        are not strings (e.g. missing values) are ignored instead of raising.
        Tokens are split on ',' only; surrounding whitespace is kept as-is.
    """
    # A dict doubles as an insertion-ordered set, which keeps the result
    # reproducible between runs (plain set ordering of strings is not).
    uniqueEntries = {}
    for cell in targetColumn.values:
        if not isinstance(cell, str):
            continue
        uniqueEntries.update(dict.fromkeys(cell.split(',')))
    return list(uniqueEntries)
|
|
|
|
def generateOneHot(dataframe, targetColumn, groups):
    """Replace a comma-separated column with 0/1 indicator columns, in place.

    For every entry of ``groups`` one new integer column is inserted at
    position 0 of ``dataframe``; afterwards ``targetColumn`` is removed.

    Args:
        dataframe: pandas DataFrame to modify in place.
        targetColumn: name of the column whose cells hold comma-separated
            tokens (split on ',' only, no whitespace stripping).
        groups: iterable whose items are either a single token (``str``) or a
            list of tokens. A string yields the column
            ``<targetColumn>_<token>``; a list yields one shared column
            ``<targetColumn>_<tok1>_<tok2>...`` that is 1 when the cell
            contains any of its tokens. Empty lists are skipped.

    Returns:
        None. Because each column is inserted at position 0, columns of later
        groups end up to the left of those of earlier groups.

    Notes:
        Rows are addressed by position, so the result is correct for any index
        (non-default or duplicated labels included). Cells that are not
        strings (e.g. missing values) are treated as containing no tokens.
    """
    # Tokenise each cell once instead of once per group element.
    rowTokens = [
        set(cell.split(',')) if isinstance(cell, str) else set()
        for cell in dataframe[targetColumn].values
    ]

    for group in groups:
        members = [group] if isinstance(group, str) else list(group)
        if not members:
            # Nothing to match against; avoids creating a nameless column.
            continue

        groupColumnName = '_'.join([targetColumn] + members)
        flags = [0 if tokens.isdisjoint(members) else 1 for tokens in rowTokens]

        # insert() refuses duplicates, so drop a pre-existing column of the
        # same name first (it would have been overwritten anyway).
        if groupColumnName in dataframe.columns:
            dataframe.pop(groupColumnName)
        dataframe.insert(0, groupColumnName, flags)

    dataframe.pop(targetColumn)
|
|
|
|
def convertToBinaryColumn(dataframe, targetColumn, positiveGroup):
    """Recode a column to 1/0 in place according to membership in a group.

    Args:
        dataframe: pandas DataFrame to modify in place.
        targetColumn: name of the column to recode.
        positiveGroup: container of the values that map to 1; every other
            value (missing values included) maps to 0.

    Returns:
        None. The column keeps its position and is replaced wholesale by an
        integer column. Rows are handled by position, so the frame's index
        labels do not matter and no rows are ever added.
    """
    dataframe[targetColumn] = [
        1 if cell in positiveGroup else 0
        for cell in dataframe[targetColumn].values
    ]
|
|
|
|
def mainDataWrangling(data):
    """Turn the raw questionnaire export into a numeric modelling table.

    Steps, in order:
      1. keep a fixed set of columns (selected by position) on a private copy;
      2. merge the three cost columns into one 0/1 ``insignificant`` flag;
      3. promote the first row to column labels (with a few overrides) and
         drop it from the data;
      4. drop rows with a missing value in any of the last ten columns and
         default ``howEssentialHumanInTheLoop`` to ``'low'``;
      5. encode the categorical answers as numbers / one-hot columns;
      6. add ``_binaryLow`` and ``_binaryHigh`` indicator columns for each of
         the last ten columns.

    Args:
        data: pandas DataFrame with at least 58 columns whose first row holds
            the machine-readable column names. It is not modified.

    Returns:
        A new DataFrame with ``projectName`` as its first column.

    Relies on ``mergeCostColumns``, ``generateOneHot``,
    ``convertToBinaryColumn`` and the module-level ``*Groups`` lists.
    """
    # 1. Column selection. The explicit copy guarantees that the writes below
    #    never touch (or warn about touching) the caller's frame.
    columnsToKeep = [
        1, 4, 5, 6, 7, 8, 10, 22, 24, 34, 35, 36, 37, 39, 45,
        48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
    ]
    data = data.iloc[:, columnsToKeep].copy()

    # 2. One flag instead of three cost columns. Positions 1-3 are dropped
    #    afterwards (presumably the three amount columns read by the lambda).
    data['insignificant'] = data.apply(
        lambda x: mergeCostColumns(x['homeBuiltAmount'], x['commisionedAmount'], x['licensedAmount']), axis=1)
    data = data.drop(columns=data.columns[1:4])

    # 3. Build the final header from the first row without writing into the
    #    frame: first label, last label and the ten columns before the last
    #    one are overridden, everything else comes from row 0.
    header = list(data.iloc[0, :].values)
    header[-1] = 'insignificant'
    header[0] = 'projectName'
    header[-11:-1] = list(data.columns[-11:-1])
    data.columns = header
    # The header row is the first row by position; remove it the same way.
    data = data.iloc[1:].reset_index(drop=True)
    data.insert(0, 'insignificant', data.pop('insignificant'))

    # 4. Rows lacking any of the last ten values are unusable.
    data = data.dropna(subset=list(data.columns[-10:])).reset_index(drop=True)
    # Assign the result back: an in-place fillna on a selected column does not
    # reach the parent frame once pandas Copy-on-Write is active.
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].fillna('low')

    # 5. Categorical answers -> numbers. The helpers rely on the 0..n-1 index
    #    established by the reset above.
    generateOneHot(data, 'enterpriseUseCases', enterpriseGroups)
    generateOneHot(data, 'soceityLevel', societyGroups)
    convertToBinaryColumn(data, 'externalParties', ['yes'])
    data['howWidelyDeployed'] = data['howWidelyDeployed'].map(
        {'controlledEnvironment': 0, 'local': 0.2, 'multipleJurisdictions': 0.5, 'global': 1})
    generateOneHot(data, 'dataType', dataTypeGroups)
    data['autonomy'] = data['autonomy'].map({'humanInTheLoop': 0, 'autonomous': 1})
    data['howEssentialHumanInTheLoop'] = data['howEssentialHumanInTheLoop'].map({'low': 0, 'medium': 0.5, 'high': 1})
    damageScale = {'none': 0, 'minor': 0.3, 'major': 0.7, 'critical': 1}
    data['damageCausedIfSubstantialFailure'] = data['damageCausedIfSubstantialFailure'].map(damageScale)
    data['damageCausedIfMarginalFailure'] = data['damageCausedIfMarginalFailure'].map(damageScale)
    generateOneHot(data, 'capabilities', capabilitiesGroups)
    data['selfAdapting'] = data['selfAdapting'].map({'no': 0, 'yesWhenUpdatedMade': 0.5, 'yesRealTime': 1})

    # 6. Indicator columns for the last ten columns. The names are captured
    #    once so appending new columns cannot shift the selection. All
    #    *_binaryLow columns are appended first, then all *_binaryHigh ones.
    riskColumns = list(data.columns[-10:])
    for riskColumn in riskColumns:
        data[riskColumn + '_binaryLow'] = data[riskColumn].map({'Low': 1, 'Medium': 0, 'High': 0})
    for riskColumn in riskColumns:
        data[riskColumn + '_binaryHigh'] = data[riskColumn].map({'Low': 0, 'Medium': 0, 'High': 1})

    data.insert(0, 'projectName', data.pop('projectName'))
    data['insignificant'] = pd.to_numeric(data['insignificant'])
    data['externalParties'] = pd.to_numeric(data['externalParties'])

    return data