#All the libraries that we will be using to complete this project
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import plotly.io as pio
pio.renderers.default='notebook'
import folium
import requests
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import classification_report
from IPython.display import Image
import warnings
##warnings.filterwarnings("ignore", category=DtypeWarning)


#reading in data set from local machine
#dataset can be found at https://www.kaggle.com/datasets/START-UMD/gtd
#dataset cookbook can be found at https://www.start.umd.edu/gtd/downloads/Codebook.pdf
# Read the GTD export into `df`. Only DtypeWarning is silenced, and only for
# the duration of the read, so other warnings still surface.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=pd.errors.DtypeWarning)
    df = pd.read_csv(
        'globalterrorismdb_0718dist.csv',
        encoding='ISO-8859-1',
    )
df


# Drop the columns the authors consider unnecessary, then rename a few
# columns so their meaning is clearer.
unused_columns = ['approxdate', 'location', 'resolution', 'multiple',
                  'scite1', 'scite2', 'scite3']
clearer_names = {
    "country": "country_id",
    "alternative": "alternative_id",
    "region": "region_id",
    "gname": "group_name",
}
df.drop(columns=unused_columns, inplace=True)
df = df.rename(columns=clearer_names)
# Row count before any rows are filtered out (used to report dropped rows).
rows = len(df)


# Keep only rows that have both a latitude and a longitude.
df = df.dropna(subset=['latitude', 'longitude'])
# NOTE: despite its name, `dropped_rows` is the number of rows that REMAIN;
# `noloc_rows` is the number that were removed for lacking coordinates.
dropped_rows = len(df)
noloc_rows = rows - dropped_rows
#print("The number of rows with no latitude/longitude informatiun is {}".format(noloc_rows))


# Sanity checks: count missing values in the group-name and country columns.
# (The original authors noted that both counts were zero.)
df[['group_name', 'country_txt']].isna().sum()

# Same check for the year / month / day columns; the authors noted no nulls,
# which is what allows the three parts to be merged into one date column.
df[['iyear', 'imonth', 'iday']].isna().sum()

0


# Column dtypes, kept in a variable for inspection.
dtypes = df.dtypes
dtypes

# Build a datetime column. A month or day of 0 is replaced with 1 so that
# every row yields a valid calendar date.
df['iday'] = df['iday'].replace(0, 1)
df['imonth'] = df['imonth'].replace(0, 1)
date_parts = df[['iyear', 'imonth', 'iday']].rename(
    columns={'iyear': 'year', 'imonth': 'month', 'iday': 'day'}
)
df["Date"] = pd.to_datetime(date_parts)

# Move the new column to the front of the dataframe.
df.insert(0, "Date", df.pop("Date"))


#quickly observing unique values of important columns

#df.attacktype1_txt.unique()
#df.targtype1_txt.unique()
#df.targsubtype1_txt.unique()
#df.weaptype1_txt.unique()
#df.propextent_txt.unique()
#df.iyear.unique()
#df.imonth.unique()


# Word cloud built from the free-text incident summaries.
# astype(str) turns missing summaries into the literal "nan", which is then
# removed through the stopword list below.
df["summary"] = df["summary"].astype(str)
summary_str = " ".join(df["summary"])

stopwords = set(STOPWORDS)
# "unkown" in the original list is a misspelling and never matched anything;
# the correctly spelled "unknown" is added (the old entry is kept, harmless).
stopwords.update(["the", "and", "so", "are", "because", "at", "in", "no", "however", "nan", "near", "incident",
                  "unkown", "unknown", "one"])
wordcloud = WordCloud(stopwords=stopwords, background_color="black").generate(summary_str)

plt.figure(figsize=(12,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


# Word cloud built from the free-text "motive" column.
# The original swallowed a KeyError for a missing column and then accessed
# the column anyway on the next line; the whole cell is now skipped explicitly
# when the column is absent.
if "motive" in df.columns:
    df["motive"] = df["motive"].astype(str)
    summary_str = " ".join(df["motive"])

    stopwords = set(STOPWORDS)
    # WordCloud compares stopwords against individual (lower-cased) tokens,
    # so multi-word entries such as "sources speculated" never match. Their
    # single-word components are added; the original entries are kept.
    stopwords.update(["nan nan", "nan", "sources speculated", "unknown", "sources posited", "Unkown",
                      "January", "sources", "speculated", "posited"])
    wordcloud = WordCloud(stopwords=stopwords, background_color="black").generate(summary_str)

    plt.figure(figsize=(10,8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
else:
    print("Column 'motive' not found; skipping the motive word cloud.")


# Bar chart of the 30 countries with the most rows (attacks) in the dataset;
# value_counts() already sorts in descending order.
top_countries = df['country_txt'].value_counts().head(30)
top_countries.plot.bar()

<AxesSubplot:>


# Top 30 countries by number of attacks in the last two years of the dataset
# (2016 and 2017). The original filter `iyear > 2016` kept only 2017, which
# contradicts the section heading ("between 2016-2017").
recent_df = df.loc[df['iyear'] >= 2016]
recent_df['country_txt'].value_counts(sort=True)[:30].plot.bar()

<AxesSubplot:>


#creating a new column for the count of attacks by year
df['year_count'] = df.groupby('iyear')['iyear'].transform('count')

# Every row of a given year carries the same year_count, so plot one row per
# year. Passing the full frame makes seaborn aggregate (and bootstrap a
# confidence interval over) all rows of each year only to draw the same line.
attacks_per_year = df[['iyear', 'year_count']].drop_duplicates()

#seaborn plot edits
sns.set_style("darkgrid")
sns.set(rc={"figure.figsize":(12,8)})
sns.set(font_scale=1.75)

#making lineplot
g = sns.lineplot(data=attacks_per_year, x="iyear", y="year_count")
g.set_xlabel("Year")
g.set_ylabel("Number of Terrorist Attacks")
g.set_title("Amount of Terrorist Attacks per Year")

Text(0.5, 1.0, 'Amount of Terrorist Attacks per Year')


# Average fatalities per attack for each known group, plotted for the top 25.

#converting column to string type
df["group_name"] = df["group_name"].astype(str)
# Exclude attacks whose perpetrator is 'Unknown'; .copy() so the new columns
# below are written to an independent frame rather than a slice of df.
threat_df = df[df["group_name"] != "Unknown"].copy()

# Total fatalities per group. The original grouped by ['group_name', 'nkill'],
# which summed within each (group, kill-count) pair instead of per group.
# Missing nkill values are skipped by the sum.
threat_df['group_success'] = threat_df.groupby('group_name')['nkill'].transform('sum')
# Number of attacks per group (rows with missing nkill are still counted).
threat_df['group_count'] = threat_df.groupby('group_name')['group_name'].transform('count')
pd.set_option('display.max_columns', None)
threat_df["killsPerAttack"] = threat_df["group_success"]/threat_df["group_count"]

# One row per group, then the 25 highest averages. The original used
# keep=False, which removes EVERY row of a duplicated name and therefore left
# only groups with exactly one recorded attack.
threat_plot = threat_df.drop_duplicates(subset=['group_name'], keep='first')
threat_plot = threat_plot.sort_values(by=['killsPerAttack'], ascending = False)
threat_plot = threat_plot.head(25)

#creating barchart using seaborn
g = sns.catplot(data=threat_plot, y='group_name',  x='killsPerAttack',kind='bar',
            ci=None, legend_out=True, height = 10, aspect = 1.75, orient = "h")
g.set_axis_labels("Number of Fatalities caused on average per Terrorist Attack", "Terrorist Groups/Organizations", size = 20)
plt.title("Top 25 Most Deadly Terrorist Groups and Organizations", y=1, fontsize = 25)

Text(0.5, 1, 'Top 25 Most Deadly Terrorist Groups and Organizations')


# Density heatmap of every attack, weighted by the 'nkill' column. Hovering
# over a point shows the responsible group's name.
heatmap_options = dict(
    lat='latitude',
    lon='longitude',
    z='nkill',
    hover_name="group_name",
    mapbox_style="stamen-terrain",
    zoom=0,
)
fig = px.density_mapbox(df, **heatmap_options)

fig.show(renderer="notebook")


# Weapon-type counts for the five regions the authors chose to compare.
regions = ['Central America & Caribbean', 'North America', 'Middle East & North Africa', 'Central Asia', 'Eastern Europe']
in_selected_region = df['region_txt'].isin(regions)
weapon_is_known = df['weaptype1_txt'] != "Unknown"
pie_df = df[in_selected_region & weapon_is_known]

# Attach the number of attacks per (weapon type, region) pair to every row,
# then keep a single representative row (the last one) for each pair.
weapon_region = ['weaptype1_txt', 'region_txt']
pie_df = pie_df.assign(
    weap_count=pie_df.groupby(weapon_region)['weaptype1_txt'].transform('count')
)
pie_df = pie_df.drop_duplicates(subset=weapon_region, keep='last')
pie_df


# One pie chart per region showing how attacks split across weapon types.
pie_regions = (
    'Central America & Caribbean',
    'North America',
    'Middle East & North Africa',
    'Central Asia',
    'Eastern Europe',
)
# pie1..pie5 hold the per-region slices of pie_df, in the order listed above.
pie1, pie2, pie3, pie4, pie5 = [
    pie_df[pie_df['region_txt'] == region_name] for region_name in pie_regions
]
for region_name, region_slice in zip(pie_regions, (pie1, pie2, pie3, pie4, pie5)):
    fig = px.pie(region_slice, values='weap_count', names='weaptype1_txt',
                 title=f'Split of Attack Method in {region_name}')
    fig.show("notebook")


# Interactive map of the most recent US attacks for four selected groups.

#Making a dataframe where all attackers are known
threat_df_for_map = df[df['group_name']!= "Unknown"]
#Making a map and adding points to it.
map_osm_for_US = folium.Map(location=[39.14, -101.2996], zoom_start=4.5)
threat_df_for_US = threat_df_for_map[threat_df_for_map["country_txt"] == "United States"]
# Newest attacks first, so the per-group cap below keeps the most recent ones.
threat_df_for_US = threat_df_for_US.sort_values(by =["Date"],ascending = False)

# Marker colour for each plotted group. This table replaces four copy-pasted
# if-blocks that each kept their own manual counter.
MAX_MARKERS_PER_GROUP = 50
marker_colors = {
    "Anti-Abortion extremists": 'red',
    "Left-Wing Militants": 'blue',
    "Fuerzas Armadas de Liberacion Nacional (FALN)": 'gray',
    "White extremists": 'purple',
}

# Keep only the plotted groups and at most MAX_MARKERS_PER_GROUP rows of each.
# groupby().head() preserves the date-descending row order, so markers are
# added in the same order as before.
rows_to_plot = threat_df_for_US[threat_df_for_US["group_name"].isin(list(marker_colors))]
rows_to_plot = rows_to_plot.groupby("group_name", sort=False).head(MAX_MARKERS_PER_GROUP)

for group, lat, lon, when in zip(rows_to_plot["group_name"], rows_to_plot["latitude"],
                                 rows_to_plot["longitude"], rows_to_plot["Date"]):
    folium.Marker(location=[lat, lon],
                  popup=group + " " + when.strftime("%m/%d/%Y"),
                  icon=folium.Icon(color=marker_colors[group])).add_to(map_osm_for_US)

map_osm_for_US


#Getting a dataframe that only has the group name and how many occurrences they have
group_counts_for_US = threat_df_for_US['group_name'].value_counts().reset_index()
group_counts_for_US.columns = ['group_name', 'count']
# Keep the groups with at least 66 attacks (the authors' cut-off for the
# "top ten" US groups). .copy() so the renames below modify an independent
# frame rather than a slice.
group_counts_for_US = group_counts_for_US[group_counts_for_US['count'] >= 66].copy()

#Renaming group names so that they fit better on the bar chart
# Keys are row labels, i.e. the rank of the group in the value_counts order.
short_names_for_US = {
    0: 'Anti-Abortion',
    1: 'Left-Wing',
    2: 'FALN',
    4: 'NWLF',
    6: 'ALF',
    7: 'JDL',
    9: 'ELF',
}
for row_label, short_name in short_names_for_US.items():
    # Assigning through .at to a label that does not exist would append a new
    # row with a NaN count, so rename only rows that survived the filter.
    if row_label in group_counts_for_US.index:
        group_counts_for_US.at[row_label, 'group_name'] = short_name

#Making the graph for the US
sns.set(rc={"figure.figsize":(14, 12)})
g = sns.barplot(data=group_counts_for_US, x="group_name", y='count')
g.set_xlabel("Group Name", fontsize = 20)
g.set_ylabel("Number of Attacks", fontsize = 20)
g.set_title("Number of Attacks by Terrorist Group in the United States", fontsize = 30)

Text(0.5, 1.0, 'Number of Attacks by Terrorist Group in the United States')


# Number of attacks per known group in Iraq, for groups with >= 20 attacks.
threat_df_for_Iraq = threat_df_for_map[threat_df_for_map["country_txt"] == "Iraq"]
group_counts_for_Iraq = threat_df_for_Iraq['group_name'].value_counts().reset_index()
group_counts_for_Iraq.columns = ['group_name', 'count']
# .copy() so the renames below modify an independent frame, not a slice.
group_counts_for_Iraq = group_counts_for_Iraq[group_counts_for_Iraq['count'] >= 20].copy()

# Short labels so the names fit on the bar chart. Keys are row labels, i.e.
# the rank of the group in the value_counts order.
# NOTE(review): the original assigned label 5 twice ('T&J', then "JTJ"); only
# the final value "JTJ" ever took effect, so that is what is kept. Confirm
# whether 'T&J' was meant for a different row.
short_names_for_Iraq = {
    0: 'ISIL',
    1: 'Al-Qaida',
    2: 'ISI',
    5: "JTJ",
    6: 'JRTN',
    7: 'Muslim Ex.',
    9: 'MCTR',
}
for row_label, short_name in short_names_for_Iraq.items():
    # Assigning through .at to a missing label would append a row with a NaN
    # count, so rename only rows that survived the filter.
    if row_label in group_counts_for_Iraq.index:
        group_counts_for_Iraq.at[row_label, "group_name"] = short_name

group_counts_for_Iraq
g = sns.barplot(data=group_counts_for_Iraq, x="group_name", y='count')
g.set_xlabel("Group Name", fontsize = 20)
g.set_ylabel("Number of Attacks", fontsize = 20)
g.set_title("Number of Attacks by Terrorist Group in Iraq", fontsize = 30)

Text(0.5, 1.0, 'Number of Attacks by Terrorist Group in Iraq')


#Trying to predict the terrorist organization based on selected predictor columns

# Only attacks with a known perpetrator can be used as labelled examples.
# .copy() so the clean-up below writes to an independent frame, not a slice.
ml_df = df[df.group_name != 'Unknown'].copy()
enc=OneHotEncoder()
pd.set_option('display.max_columns', 50)

#Making the columns categorical
# The original assigned pd.Categorical(df.specificity) to 'vicinity', which
# overwrote that column with the specificity values.
df['specificity'] = pd.Categorical(df.specificity)
df['vicinity'] = pd.Categorical(df.vicinity)
df['success'] = pd.Categorical(df.success)

# Fold the -9 code in 'ishostkid' and 'INT_ANY' into 0. The original called
# .replace('-9','0') and discarded the result (and compared against a string),
# so nothing was ever replaced.
# NOTE(review): assumes both columns are numeric and -9 means "unknown";
# confirm against the codebook.
for flag_column in ('ishostkid', 'INT_ANY'):
    ml_df[flag_column] = ml_df[flag_column].replace(-9, 0)

#choosing the predictor columns, which are all one-hot encoded
predictor_columns = ['extended','country_txt', 'specificity', 'vicinity', 'crit1',
    'crit2','crit3','success','suicide','attacktype1_txt','targtype1_txt', 'guncertain1','weaptype1_txt',
    'property','ishostkid','ransom','INT_ANY']
enc_data=pd.DataFrame(enc.fit_transform(ml_df[predictor_columns]).toarray())

X = enc_data
y = ml_df["group_name"] #target variable

#training the model
#The max depth determines how deep each tree in the random forest will go down to before it must make a conclusion
# The number of estimators is the number of decision trees that the classifier will make
SEED = 99
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
rfc = RandomForestClassifier(n_estimators=100, max_depth=9,random_state=SEED)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

#evaluation metrics
# One column per group plus the summary columns; the last row of the report
# (support) is dropped before display.
final_df = pd.DataFrame(classification_report(y_test,y_pred, zero_division = 0,output_dict = True))
final_df = final_df.drop(final_df.index[-1])
final_df.head(50)


#Predicting whether the terrorist attack was a success or not

# Attacks with an unknown weapon type are excluded.
# Removed from the original: a redundant `ml1_df=df` that was immediately
# overwritten, and two `.replace('-9','0')` calls whose results were
# discarded (on columns that are not features of this model).
ml1_df = df[df.weaptype1_txt != 'Unknown']
pd.set_option('display.max_columns', 50)

#target ('success', first column) followed by the predictor columns
enc_data = ml1_df[['success','weaptype1_txt','region_txt',
                   'attacktype1_txt','targtype1_txt', 'guncertain1', 'specificity']]
#one-hot encoding of every column except the target
df_dummies = pd.get_dummies(data=enc_data, columns=enc_data.columns[1:])
y = df_dummies['success']
X = df_dummies.drop(['success'], axis = 1)

#training the model
SEED = 99
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=SEED)
rfc = RandomForestClassifier(n_estimators=5, max_depth=9,random_state=SEED)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

#evaluation metrics
print(classification_report(y_test,y_pred))

#10-fold cross validation over the full (un-split) feature matrix
cv = KFold(n_splits=10, random_state=1, shuffle=True)
scores = cross_val_score(rfc, X, y,cv=cv, n_jobs=-1)
print(scores)

              precision    recall  f1-score   support

           0       0.72      0.24      0.35      2681
           1       0.91      0.99      0.95     21745

    accuracy                           0.91     24426
   macro avg       0.82      0.61      0.65     24426
weighted avg       0.89      0.91      0.88     24426

[0.8940678  0.90770081 0.89670843 0.90407762 0.90241356 0.90241356
 0.90339618 0.89940429 0.89756187 0.89246453]

	eventid	iyear	imonth	iday	approxdate	extended	resolution	country	country_txt	region	region_txt	provstate	city	latitude	longitude	specificity	vicinity	location	summary	crit1	crit2	crit3	doubtterr	alternative	alternative_txt	...	nhostkid	nhostkidus	nhours	ndays	divert	kidhijcountry	ransom	ransomamt	ransomamtus	ransompaid	ransompaidus	ransomnote	hostkidoutcome	hostkidoutcome_txt	nreleased	addnotes	scite1	scite2	scite3	dbsource	INT_LOG	INT_IDEO	INT_MISC	INT_ANY	related
0	197000000001	1970	7	2	NaN	0	NaN	58	Dominican Republic	2	Central America & Caribbean	NaN	Santo Domingo	18.456792	-69.951164	1.0	0	NaN	NaN	1	1	1	0.0	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	PGIS	0	0	0	0	NaN
1	197000000002	1970	0	0	NaN	0	NaN	130	Mexico	1	North America	Federal	Mexico city	19.371887	-99.086624	1.0	0	NaN	NaN	1	1	1	0.0	NaN	NaN	...	1.0	0.0	NaN	NaN	NaN	Mexico	1.0	800000.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	PGIS	0	1	1	1	NaN
2	197001000001	1970	1	0	NaN	0	NaN	160	Philippines	5	Southeast Asia	Tarlac	Unknown	15.478598	120.599741	4.0	0	NaN	NaN	1	1	1	0.0	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	PGIS	-9	-9	1	1	NaN
3	197001000002	1970	1	0	NaN	0	NaN	78	Greece	8	Western Europe	Attica	Athens	37.997490	23.762728	1.0	0	NaN	NaN	1	1	1	0.0	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	PGIS	-9	-9	1	1	NaN
4	197001000003	1970	1	0	NaN	0	NaN	101	Japan	4	East Asia	Fukouka	Fukouka	33.580412	130.396361	1.0	0	NaN	NaN	1	1	1	-9.0	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	0.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	PGIS	-9	-9	1	1	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
181686	201712310022	2017	12	31	NaN	0	NaN	182	Somalia	11	Sub-Saharan Africa	Middle Shebelle	Ceelka Geelow	2.359673	45.385034	2.0	0	The incident occurred near the town of Balcad.	12/31/2017: Assailants opened fire on a Somali...	1	1	0	1.0	1.0	Insurgency/Guerilla Action	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	"Somalia: Al-Shabaab Militants Attack Army Che...	"Highlights: Somalia Daily Media Highlights 2 ...	"Highlights: Somalia Daily Media Highlights 1 ...	START Primary Collection	0	0	0	0	NaN
181687	201712310029	2017	12	31	NaN	0	NaN	200	Syria	10	Middle East & North Africa	Lattakia	Jableh	35.407278	35.942679	1.0	1	The incident occurred at the Humaymim Airport.	12/31/2017: Assailants launched mortars at the...	1	1	0	1.0	1.0	Insurgency/Guerilla Action	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	"Putin's 'victory' in Syria has turned into a ...	"Two Russian soldiers killed at Hmeymim base i...	"Two Russian servicemen killed in Syria mortar...	START Primary Collection	-9	-9	1	1	NaN
181688	201712310030	2017	12	31	NaN	0	NaN	160	Philippines	5	Southeast Asia	Maguindanao	Kubentog	6.900742	124.437908	2.0	0	The incident occurred in the Datu Hoffer distr...	12/31/2017: Assailants set fire to houses in K...	1	1	1	0.0	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	"Maguindanao clashes trap tribe members," Phil...	NaN	NaN	START Primary Collection	0	0	0	0	NaN
181689	201712310031	2017	12	31	NaN	0	NaN	92	India	6	South Asia	Manipur	Imphal	24.798346	93.940430	1.0	0	The incident occurred in the Mantripukhri neig...	12/31/2017: Assailants threw a grenade at a Fo...	1	1	1	0.0	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	"Trader escapes grenade attack in Imphal," Bus...	NaN	NaN	START Primary Collection	-9	-9	0	-9	NaN
181690	201712310032	2017	12	31	NaN	0	NaN	160	Philippines	5	Southeast Asia	Maguindanao	Cotabato City	7.209594	124.241966	1.0	0	NaN	12/31/2017: An explosive device was discovered...	1	1	1	0.0	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	"Security tightened in Cotabato following IED ...	"Security tightened in Cotabato City," Manila ...	NaN	START Primary Collection	-9	-9	0	-9	NaN

	Abu Sayyaf Group (ASG)	...	accuracy	macro avg	weighted avg
precision	0.818182	...	0.568792	0.030449	0.417401
recall	0.088235	...	0.568792	0.026104	0.568792
f1-score	0.159292	...	0.568792	0.024629	0.448185

CMSC320 Final Project: Analyzing the Global Terrorism Database¶

By Omeed Zarrabian and Adithya Raj¶

Introduction ¶

Explanation ¶

Reading and Cleaning the Data¶

Data Cleaning¶

Data Exploration and Visualization ¶

Wordclouds¶

Graphing¶

Top 30 countries with the most terrorist attacks since 1970:¶

Top 30 countries with the most terrorist attacks between 2016-2017 (last 2 years in dataset):¶

Graphing Terrorist Attacks per Year¶

Graphing the Most Deadly Terrorist Organizations¶

Weapon Usage¶

Some More (Localized) Information¶

Comparing Groups in the United States vs Groups in Iraq¶

Producing our Machine Learning Model(s) ¶

First Attempt (Predicting which group was responsible for an attack)¶

Second Attempt at a Model (Predicting whether or not an attack was successful)¶

Conclusion and Closing Thoughts ¶