Commit 3bdaa2a1 authored by Petr Wehrenberg's avatar Petr Wehrenberg
Browse files

Pylint changes and new data structure parser

parent da3cc1e7
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -3,3 +3,4 @@
.idea
Pipfile.lock
.ipynb_checkpoints/*
__pycache__/*

elasticsearch_connection.py

deleted100644 → 0
+0 −111
Original line number Diff line number Diff line
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch
import re


class ElasticSearchConnection:
    """Wrapper around an Elasticsearch client that fetches KYPO bash command
    logs and training events and converts them to typed pandas DataFrames.

    Typical usage: construct, call set_sandbox_id / set_time_from /
    set_time_to, then fetch_bash_logs() / fetch_events().
    """

    # Index-name prefix of the per-sandbox bash command log indices.
    BASH_INDEX_PREFIX = 'kypo.logs.console.bash.command.pool=1.sandbox='

    def __init__(self, host, port):
        # NOTE(review): the original passed HOST=/PORT= keyword arguments,
        # which the elasticsearch client does not recognise and silently
        # ignores (it then falls back to its localhost:9200 default).
        # Passing an explicit host URL makes the parameters take effect.
        self.es = Elasticsearch(hosts=[host + ':' + str(port)])
        self.sandbox_ids = []   # list of sandbox-id strings to query
        self.time_from = None   # ISO timestamp string, e.g. '2020-04-05T17:20:00'
        self.time_to = None     # ISO timestamp string (upper bound)

    def set_sandbox_id(self, sb_id):
        """Set the list of sandbox ids (strings) the fetch methods query."""
        self.sandbox_ids = sb_id

    def set_time_from(self, time_from):
        """Set the lower bound (ISO timestamp string) of the query range."""
        self.time_from = time_from

    def set_time_to(self, time_to):
        """Set the upper bound (ISO timestamp string) of the query range."""
        self.time_to = time_to

    def _bash_index_pattern(self):
        """Comma-separated bash-log index pattern for all configured sandboxes."""
        return ','.join(self.BASH_INDEX_PREFIX + sb_id for sb_id in self.sandbox_ids)

    def fetch_bash_logs(self):
        """Fetch bash logs for the configured sandboxes/time range as a DataFrame."""
        resp_bash = self.elasticsearch_search(
            'bash logs', self._bash_index_pattern(),
            self.build_date_range_query('timestamp_str'))
        return self.transform_bash_logs_to_df(resp_bash)

    def fetch_bash_logs_raw(self):
        """Fetch bash logs as the raw Elasticsearch hit list (no DataFrame).

        NOTE(review): the original file defined this method twice with
        identical bodies; the duplicate definition was removed.
        """
        return self.elasticsearch_search(
            'bash logs', self._bash_index_pattern(),
            self.build_date_range_query('timestamp_str'))

    def sniff_bash_logs(self):
        """Summarize bash logs over *all* sandboxes: first/last timestamp and
        command count per sandbox."""
        path_bash = self.BASH_INDEX_PREFIX + '*'
        resp_bash = self.elasticsearch_search('bash logs', path_bash, {'size': 9999})
        bash_df = self.transform_bash_logs_to_df(resp_bash)
        return bash_df.groupby(['sandbox_id']).agg({'timestamp': [np.min, np.max], 'cmd_raw': ['count']})

    def sniff_events(self):
        """Summarize training events over *all* sandboxes: first/last timestamp
        and event count per sandbox."""
        path_events = 'kypo.*_evt.*.sandbox=*.definition=*.instance=*.run=*'
        resp_events = self.elasticsearch_search('events', path_events, {'size': 9999})
        events_df = self.transform_events_to_df(resp_events)
        return events_df.groupby(['sandbox_id']).agg({'timestamp': [np.min, np.max], 'event_type': ['count']})

    def fetch_events(self):
        """Fetch training events for the configured sandboxes/time range as a DataFrame."""
        path_events = ','.join(
            'kypo.*_evt.*.sandbox=' + sb_id + '.definition=*.instance=*.run=*'
            for sb_id in self.sandbox_ids)
        resp_events = self.elasticsearch_search(
            'events', path_events, self.build_date_range_query('syslog.@timestamp'))
        return self.transform_events_to_df(resp_events)

    def elasticsearch_search(self, call_name, index, body):
        """Run a search and return the hit list; on an Elasticsearch error,
        print a friendly message and return [] (404s are not raised)."""
        resp = self.es.search(index=index, body=body, ignore=[404])
        if 'error' in resp:
            print('[!] Request problem in', call_name, '-', self.resolve_elastic_error(resp['error']['type']))
            return []
        return resp['hits']['hits']

    def build_date_range_query(self, timestamp_field):
        """Build a range query restricting *timestamp_field* to [time_from, time_to].

        See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html
        NOTE(review): the '+02:00' offset (CEST) is hard-coded — confirm it
        matches the timezone the logs were recorded in.
        """
        return {'size': 9999,
                'query': {'range': {timestamp_field: {'gte': self.time_from + '+02:00',
                                                      'lte': self.time_to + '+02:00'}}}}

    def transform_bash_logs_to_df(self, bash_logs):
        """Convert raw bash-log hits to a DataFrame with columns
        sandbox_id (int), timestamp (datetime), cmd_raw (string), cmd_type (category)."""
        cols = {'sandbox_id': [], 'timestamp': [], 'cmd_raw': [], 'cmd_type': []}
        for log in bash_logs:
            # The sandbox id only appears in the index name, not the document.
            cols['sandbox_id'].append(re.search(r'sandbox=(\d*)', log['_index']).group(1))
            cols['timestamp'].append(log['_source']['timestamp_str'])
            cols['cmd_raw'].append(log['_source']['cmd'])
            cols['cmd_type'].append(log['_source']['cmd_type'])

        df = pd.DataFrame(cols)
        df['sandbox_id'] = df['sandbox_id'].astype('int')
        df['timestamp'] = df['timestamp'].apply(pd.to_datetime)
        df['cmd_type'] = df['cmd_type'].astype('category')
        df['cmd_raw'] = df['cmd_raw'].astype('string')
        return df

    def transform_events_to_df(self, events):
        """Convert raw event hits to a DataFrame with columns
        sandbox_id (int), timestamp (datetime), event_type (category), level (int)."""
        cols = {'sandbox_id': [], 'timestamp': [], 'event_type': [], 'level': []}
        for event in events:
            cols['sandbox_id'].append(event['_source']['sandbox_id'])
            cols['timestamp'].append(event['_source']['syslog']['@timestamp'])
            # Keep only the last dotted component of the fully-qualified type.
            cols['event_type'].append(event['_source']['type'].split('.')[-1])
            cols['level'].append(event['_source']['level'])

        df = pd.DataFrame(cols)
        df['sandbox_id'] = df['sandbox_id'].astype('int')
        df['timestamp'] = df['timestamp'].apply(pd.to_datetime)
        df['event_type'] = df['event_type'].astype('category')
        df['level'] = df['level'].astype('int')
        return df

    def resolve_elastic_error(self, error):
        """Map known Elasticsearch error types to user-friendly messages;
        unknown types are returned unchanged."""
        error_switch = {
            'index_not_found_exception': 'Sandbox given on input does not exist'
        }
        return error_switch.get(error, error)
+112 −10
Original line number Diff line number Diff line
%% Cell type:markdown id:175dd1a2 tags:

# KYPO Bash Logs Analysis scripts

These scripts help to get fast knowledge about data from KYPO Training or clean them up.

Each script contains its definition with example output, the cell with code, and following cell the example usage. Almost every script offers two functions. One function with analysis implementation usually returns result in Pandas `DataFrame` or some other data structure. The next function with the `show_` prefix sends formatted output to the console.


#### Prerequisite
Scripts are dependent on module kypo_elk_scripts_utils and elasticsearch_connection. Be sure these modules are in the same folder, and the notebook is not used independently.

%% Cell type:code id:eda9570c tags:

``` python
import pandas as pd
import numpy as np
from elasticsearch_connection import ElasticSearchConnection
from kypo_data_preprocessor import KYPODataPreprocessor
from kypo_elk_scripts_utils import parse_cmd, format_delta_time, filter_logs_in_level_range
import re
```

%% Cell type:markdown id:c643116e tags:

### Setup
Here you should set up your data features.

%% Cell type:code id:931ae3f1 tags:

``` python
esc = ElasticSearchConnection('http://localhost', '9200')
esc.set_sandbox_id(['130', '131', '128', '129'])
esc = KYPODataPreprocessor('http://localhost', '9200')
esc.set_sandbox_ids(['130', '131', '128', '129'])
esc.set_time_from('2020-04-05T17:20:00')
esc.set_time_to('2020-06-07T17:40:00')
```

%% Cell type:markdown id:a7bb5249 tags:

### Data test

If the data are unclear and you are unsure which Sandbox IDs are included, use these methods (`sniff_events()`, `sniff_bash_logs()`) to explore basic information about data in Elasticsearch.
If the data are unclear and you are unsure which Sandbox IDs are included, use these methods (`explore_events()`, `explore_bash_logs()`) to explore basic information about data in Elasticsearch.

%% Cell type:code id:c5c2c930 tags:

``` python
esc.sniff_events()
esc.explore_events()
```

%% Output

                                      timestamp                                   \
                                           amin                             amax
    sandbox_id
    2          2020-05-06 13:25:14.192000+00:00 2020-05-07 16:11:18.210000+00:00
    14         2021-10-06 12:51:10.361000+00:00 2021-10-06 14:09:56.963000+00:00
    15         2021-10-08 07:55:11.913000+00:00 2021-10-08 10:12:30.217000+00:00
    16         2021-10-08 09:26:00.047000+00:00 2021-10-13 17:10:01.247000+00:00
    17         2021-10-08 14:24:46.476000+00:00 2021-10-08 15:09:42.204000+00:00
    ...                                     ...                              ...
    220        2020-05-08 17:29:01.279000+00:00 2020-05-10 08:16:12.641000+00:00
    239        2020-05-11 10:12:18.504000+00:00 2020-05-11 12:39:47.420000+00:00
    340        2020-05-14 08:04:40.413000+00:00 2020-05-14 10:00:33.557000+00:00
    341        2020-05-14 08:05:20.637000+00:00 2020-05-14 09:39:30.578000+00:00
    342        2020-05-14 08:28:57.394000+00:00 2020-05-14 09:45:12.721000+00:00
    
               event_type
                    count
    sandbox_id
    2                 128
    14                 43
    15                 39
    16                 21
    17                 23
    ...               ...
    220                44
    239                54
    340                33
    341                46
    342                35
    
    [71 rows x 3 columns]

%% Cell type:code id:a819f247 tags:

``` python
esc.sniff_bash_logs()
esc.explore_bash_logs()
```

%% Output

                                      timestamp                                    \
                                           amin                              amax
    sandbox_id
    128        2020-05-06 16:20:12.098331+02:00  2020-05-06 17:37:03.137332+02:00
    129        2020-05-06 16:40:34.978581+02:00  2020-05-06 17:20:25.268125+02:00
    130        2020-05-06 16:26:28.378340+02:00  2020-05-06 17:05:35.286306+02:00
    131        2020-05-06 16:29:29.777667+02:00  2020-05-06 17:30:12.723408+02:00
    172        2020-05-06 19:01:41.358029+02:00  2020-05-07 14:31:01.169936+02:00
    185               2019-12-12 10:11:09+02:00         2019-12-12 11:28:13+02:00
    187               2019-12-12 10:17:36+02:00         2019-12-12 11:19:15+02:00
    188               2019-12-12 10:41:21+02:00         2019-12-12 11:56:51+02:00
    195               2019-12-12 11:38:03+02:00         2019-12-12 11:47:22+02:00
    340        2020-05-14 10:18:23.449882+02:00  2020-05-14 09:58:22.781889+00:00
    341        2020-05-14 10:20:38.723241+02:00  2020-05-14 11:35:39.423294+02:00
    342        2020-05-14 10:34:24.500753+02:00  2020-05-14 11:42:59.800874+02:00
    
               cmd_raw
                 count
    sandbox_id
    128             36
    129              9
    130             10
    131             24
    172            145
    185             56
    187             59
    188             23
    195              6
    340             67
    341             15
    342             70

%% Cell type:markdown id:e84e39d5 tags:

### 2. Show used tools
**Definition**

For a given subset of command logs, show all the tools that are used in the logs (optionally with the counts of how often they were used).

**Explanation**

The function `extract_used_tools` returns a DataFrame with an extra column `cmd`, which contains the program name parsed from the `cmd_raw` column. The following function `show_used_tools` prints the count of each command to the console.

%% Cell type:code id:e5047046 tags:

``` python
def extract_used_tools(logs_df):
    """Add a 'cmd' column holding the program name parsed out of each raw
    command line; returns the (mutated) DataFrame."""
    logs_df['cmd'] = logs_df['cmd_raw'].apply(lambda raw_cmd: parse_cmd(raw_cmd)[0])
    return logs_df

def show_used_tools(logs_df):
    """Print how many times each tool occurs in the logs."""
    tool_counts = extract_used_tools(logs_df)['cmd'].value_counts()
    print(tool_counts)
```

%% Cell type:code id:fb073f9a tags:

``` python
# Print to console
logs_df = esc.fetch_bash_logs()
show_used_tools(logs_df)

# Create chart from updated data frame
logs_df = extract_used_tools(logs_df)
# logs_df['cmd'].value_counts().plot.bar(rot=0)
```

%% Output

    [!] Request problem in bash logs - Sandbox given on input does not exist
    Series([], Name: cmd, dtype: Int64)
    nmap                20
    python              20
    ls                   7
    python3              5
    mv                   3
    cd                   3
    man                  3
    for                  2
    show                 2
    msf                  2
    msfconsole           2
    mfsconsole           2
    curl                 1
    map                  1
    top                  1
    ~htop                1
    metasploit           1
    grep                 1
    script.py            1
    shooooooooooooow     1
    Name: cmd, dtype: int64

%% Cell type:markdown id:58848fda tags:

### 3. Show all combinations of arguments

**Definition**

For a given tool, show all the combinations (n-tuples) of its arguments that appear in the logs. Note that some arguments are standalone (e.g., `--help`), and some require an additional parameter (e.g., `-p 20`).

**Explanation**

The function `combinations_of_args` searches for unique command parameter combinations and returns them as an array of strings. The `show_combinations_of_args` function prints formatted output to the console.

%% Cell type:code id:8db63fab tags:

``` python
def combinations_of_args(logs_df, exp_cmd):
    """Collect the distinct argument combinations used with *exp_cmd*.

    Returns a list of argument lists, one per unique combination found in
    the 'cmd_raw' column.
    """
    unique_invocations = set()
    for raw_cmd in logs_df['cmd_raw']:
        parsed = parse_cmd(raw_cmd)
        if parsed[0] == exp_cmd:
            unique_invocations.add(' '.join([parsed[0]] + parsed[1]))
    return [parse_cmd(invocation)[1] for invocation in unique_invocations]


def show_combinations_of_args(logs_df, exp_cmd):
    """Print each argument combination found for *exp_cmd*."""
    for arg_combo in combinations_of_args(logs_df, exp_cmd):
        print(exp_cmd + '  -  ' + ' '.join(arg_combo))

```

%% Cell type:code id:402d9398 tags:

``` python
# Print to console
logs_df = esc.fetch_bash_logs()
show_combinations_of_args(logs_df, 'python')

# Get output values
print(combinations_of_args(logs_df, 'nmap'))
```

%% Output

    [!] Request problem in bash logs - Sandbox given on input does not exist
    []
    python  -  172.18.1.5:10000 id script.py
    python  -  "ls" http://172.18.1.5:10000 script.py
    python  -  "cat twofactor_form.cgi" http://172.18.1.5:10000 script.py
    python  -  172.18.1.5:10000/session_login.cgi ls script.py
    python  -  "cat twofactor.pl" http://172.18.1.5:10000 script.py
    python  -  CVE_2019_15107.py https://10.10.20.166:10000
    python  -  http://172.18.1.5:10000 ls script.py
    python  -  https://172.18.1.5:10000 ls script.py
    python  -  "cat CHANGELOG" http://172.18.1.5:10000 script.py
    python  -  CVE_2019_15107.py https://10.10.20.166:10000 ls
    python  -  /root;ls cd http://172.18.1.5:10000 script.py
    python  -  "cat *" > http://172.18.1.5:10000 out.txt script.py
    python  -  172.18.1.5:10000 ls script.py
    python  -  "cd /root;ls" http://172.18.1.5:10000 script.py
    python  -  "twofactor_form.cgi" http://172.18.1.5:10000 script.py
    python  -  "cat *" http://172.18.1.5:10000 script.py
    [['-p 172.18.1.5'], ['-p 10000', '172.18.1.5'], ['-n 172.18.1.0/24', 'sP'], ['172.18.1.5'], ['-p 80', '172.18.1.5'], [], ['-F 172.18.1.0/24', '-n ', 'sP'], ['-n 172.18.1.5', 'sP'], ['-n 172.18.1.0', 'sP'], ['-p 10000', '-sV 172.18.1.5'], ['-p 20000', '172.18.1.5'], ['-F 172.18.1.5'], ['-sV 172.18.1.5'], ['-65535 172.18.1.0/24', 'p0'], ['172.18.1.5', 'p']]

%% Cell type:markdown id:5989531f tags:

### 4: Equivalence classes of commands

**Definition**

For a given subset of command logs, show equivalence classes of commands (considering the order of arguments)

**Explanation**

The function `eq_classes_of_commands` normalizes the command parameters and returns the commands with equivalent meaning. The following function `show_eq_classes_of_commands` prints the result to the console.

%% Cell type:code id:f4aa2670 tags:

``` python
def eq_classes_of_commands(logs_df):
    """Group command logs into equivalence classes.

    Commands whose program name and normalized parameters match are
    equivalent.  Returns a list of lists of raw command strings, keeping
    only classes containing more than one distinct raw form.
    """
    logs_df['cmd'] = logs_df['cmd_raw'].apply(lambda raw: parse_cmd(raw)[0])
    logs_df['norm_params'] = logs_df['cmd_raw'].apply(lambda raw: ' '.join(parse_cmd(raw)[1]))
    unique_cmds = logs_df.drop_duplicates(subset=['cmd', 'norm_params'])
    eq_cmds = []
    for _, unique_row in unique_cmds.iterrows():
        same_class = (logs_df['cmd'] == unique_row['cmd']) & (logs_df['norm_params'] == unique_row['norm_params'])
        eq_rows = logs_df.loc[same_class]
        if eq_rows.drop_duplicates(subset=['cmd_raw'])['cmd'].count() > 1:
            eq_cmds.append([member['cmd_raw'] for _, member in eq_rows.iterrows()])
    return eq_cmds

def show_eq_classes_of_commands(logs_df):
    """Print each equivalence class as its raw commands joined by ' == '."""
    for eq_class in eq_classes_of_commands(logs_df):
        print(' == '.join(eq_class))
```

%% Cell type:code id:10459ac8 tags:

``` python
logs_df = esc.fetch_bash_logs()
show_eq_classes_of_commands(logs_df)
```

%% Output

    nmap -p 172.18.1.5 == nmap -p  172.18.1.5
    nmap -p-  172.18.1.5 == nmap -p-  172.18.1.5 == nmap -p- 172.18.1.5

%% Cell type:markdown id:59da28c0 tags:

### 5. Disregard the sudo

**Definition**

For commands with "sudo", disregard the "sudo" key-word.

**Explanation**

The function `remove_sudo_from_df` removes the `sudo` prefix in `cmd_raw` column and return `DataFrame`.

%% Cell type:code id:f5fa1a27 tags:

``` python
def remove_sudo(cmd):
    """Return *cmd* with a leading 'sudo' token removed.

    Whitespace is normalized to single spaces either way (the command is
    re-joined from its tokens).  Empty or whitespace-only input yields ''.
    """
    tokens = cmd.split()
    if not tokens:
        # Guard: the original indexed tokens[0] directly and raised
        # IndexError on an empty or whitespace-only command.
        return ''
    if tokens[0] == 'sudo':
        tokens = tokens[1:]
    return ' '.join(tokens)

def remove_sudo_from_df(logs_df):
    """Strip the 'sudo' prefix from every entry of the 'cmd_raw' column and
    return the (mutated) DataFrame."""
    logs_df['cmd_raw'] = logs_df['cmd_raw'].apply(remove_sudo)
    return logs_df
```

%% Cell type:code id:84a529ce tags:

``` python
logs_df = esc.fetch_bash_logs()
remove_sudo_from_df(logs_df)
logs_df
```

%% Output

        sandbox_id                        timestamp  \
    0          128 2020-05-06 16:20:12.098331+02:00
    1          128 2020-05-06 16:35:30.999179+02:00
    2          128 2020-05-06 16:37:35.317756+02:00
    3          128 2020-05-06 16:20:39.407920+02:00
    4          128 2020-05-06 16:24:25.830504+02:00
    ..         ...                              ...
    74         131 2020-05-06 17:22:11.958857+02:00
    75         131 2020-05-06 17:24:08.072683+02:00
    76         131 2020-05-06 17:26:17.803323+02:00
    77         131 2020-05-06 17:26:24.386783+02:00
    78         131 2020-05-06 17:30:12.723408+02:00
    
                                                  cmd_raw      cmd_type
    0                               nmap -p 80 172.18.1.5  bash-command
    1                                  nmap -p 172.18.1.5  bash-command
    2                                     nmap 172.18.1.5  bash-command
    3                            nmap -p 20000 172.18.1.5  bash-command
    4   for i in {1..65535}; do nmap -p ${i} 172.18.1....  bash-command
    ..                                                ...           ...
    74                                      show exploits  bash-command
    75                shooooooooooooow exploiiiiiiiiiiii2  bash-command
    76                                      show exploits  bash-command
    77                                         msfconsole  bash-command
    78                                                 ls  bash-command
    
    [79 rows x 4 columns]

%% Cell type:markdown id:0c787020 tags:

### 6. Show the total duration of the actions
**Definition**

For a given subset of commands, show the total duration of the actions.

**Explanation**

The function `actions_total_duration_for_sandbox` looks up the minimal and maximal time value for each sandbox and returns them as a `DataFrame`. The following `show_actions_total_duration_for_sandbox` prints the output to the console.

%% Cell type:code id:56a0269a tags:

``` python
def actions_total_duration_for_sandbox(logs_df):
    """Aggregate logs per sandbox into the earliest and latest timestamp."""
    return logs_df.groupby(['sandbox_id']).agg({'timestamp': [np.min, np.max]})

def show_actions_total_duration_for_sandbox(logs_df):
    """Print, for each sandbox, the first/last log time and the total span."""
    for _, row in actions_total_duration_for_sandbox(logs_df).iterrows():
        start, end = row.timestamp.amin, row.timestamp.amax
        print('Sandbox {} contains logs from {} to {} ({})'.format(
            row.name,
            start.strftime('%H:%M:%S %d-%m-%Y'),
            str(end.strftime('%H:%M:%S %d-%m-%Y')),
            format_delta_time(end - start)))
```

%% Cell type:code id:f2a750c7 tags:

``` python
# Aggregated DF
logs_df = esc.fetch_bash_logs()
print(actions_total_duration_for_sandbox(logs_df))

# Print to console
logs_df = esc.fetch_bash_logs()
show_actions_total_duration_for_sandbox(logs_df)
```

%% Output

                                      timestamp
                                           amin                             amax
    sandbox_id
    128        2020-05-06 16:20:12.098331+02:00 2020-05-06 17:37:03.137332+02:00
    129        2020-05-06 16:40:34.978581+02:00 2020-05-06 17:20:25.268125+02:00
    130        2020-05-06 16:26:28.378340+02:00 2020-05-06 17:05:35.286306+02:00
    131        2020-05-06 16:29:29.777667+02:00 2020-05-06 17:30:12.723408+02:00
    Sandbox 128 contains logs from 16:20:12 06-05-2020 to 17:37:03 06-05-2020 (1:16:51)
    Sandbox 129 contains logs from 16:40:34 06-05-2020 to 17:20:25 06-05-2020 (0:39:50)
    Sandbox 130 contains logs from 16:26:28 06-05-2020 to 17:05:35 06-05-2020 (0:39:06)
    Sandbox 131 contains logs from 16:29:29 06-05-2020 to 17:30:12 06-05-2020 (1:00:42)

%% Cell type:markdown id:818c7ef5 tags:

### 7. The duration between two actions

**Definition**

Find command logs for which the total duration is lower/higher than a given number.

**Explanation**

The function `actions_total_duration_for_sandbox_to_number` finds the sandboxes, where the difference between the first and last command log is lower or higher than the number in seconds given in parameters. Note that the last parameter must be `"lower"` or `"higher"`. The function `show_actions_total_duration_for_sandbox_to_number` prints result to the console.

%% Cell type:code id:d2973578 tags:

``` python
def actions_total_duration_for_sandbox_to_number(logs_df, time_up, state):
    """Select sandboxes whose total log time span is at most ('lower') or at
    least ('higher') *time_up* seconds; returns the aggregated DataFrame."""
    threshold = pd.Timedelta('{} seconds'.format(time_up))
    summary = logs_df.groupby(['sandbox_id']).agg({'timestamp': [np.min, np.max]})
    summary['delta'] = (summary.timestamp.amax - summary.timestamp.amin).fillna(pd.Timedelta('0 seconds'))
    if state == 'lower':
        keep = summary['delta'] <= threshold
    else:
        keep = summary['delta'] >= threshold
    return summary.loc[keep]

def show_actions_total_duration_for_sandbox_to_number(logs_df, time_up, state):
    """Print the time span of every sandbox selected above."""
    for _, row in actions_total_duration_for_sandbox_to_number(logs_df, time_up, state).iterrows():
        start, end = row.timestamp.amin, row.timestamp.amax
        print('Sandbox {} contains logs from {} to {} ({})'.format(
            row.name,
            start.strftime('%H:%M:%S %d-%m-%Y'),
            str(end.strftime('%H:%M:%S %d-%m-%Y')),
            format_delta_time(end - start)))
```

%% Cell type:code id:3e681062 tags:

``` python
# Aggregated DF
logs_df = esc.fetch_bash_logs()
actions_total_duration_for_sandbox_to_number(logs_df, 50*60, 'higher')

# Print to console
logs_df = esc.fetch_bash_logs()
show_actions_total_duration_for_sandbox_to_number(logs_df, 50*60, 'lower')
```

%% Output

    Sandbox 129 contains logs from 16:40:34 06-05-2020 to 17:20:25 06-05-2020 (0:39:50)
    Sandbox 130 contains logs from 16:26:28 06-05-2020 to 17:05:35 06-05-2020 (0:39:06)

%% Cell type:markdown id:2abf232e tags:

### 8. Duration of the actions to the given number

**Definition**

For a given subset of commands, show the time differences between two successive actions.

**Example**

The function `command_time_duration` extends the `DataFrame` with one column, the time difference between the cmd and cmd before.

%% Cell type:code id:f870d655 tags:

``` python
def command_time_duration(logs_df):
    """Sort logs by time and add a 'delta' column: the formatted time gap
    since the previous command (the first row gets a zero gap)."""
    ordered = logs_df.sort_values(by='timestamp')
    gaps = (ordered['timestamp'] - ordered['timestamp'].shift()).fillna(pd.Timedelta('0 seconds'))
    ordered['delta'] = gaps.apply(lambda gap: format_delta_time(gap))
    return ordered
```

%% Cell type:code id:18ef239b tags:

``` python
logs_df = esc.fetch_bash_logs()
command_time_duration(logs_df)
```

%% Output

        sandbox_id                        timestamp  \
    0          128 2020-05-06 16:20:12.098331+02:00
    3          128 2020-05-06 16:20:39.407920+02:00
    4          128 2020-05-06 16:24:25.830504+02:00
    45         130 2020-05-06 16:26:28.378340+02:00
    46         130 2020-05-06 16:28:30.551719+02:00
    ..         ...                              ...
    78         131 2020-05-06 17:30:12.723408+02:00
    32         128 2020-05-06 17:35:19.914772+02:00
    33         128 2020-05-06 17:36:01.954768+02:00
    34         128 2020-05-06 17:36:34.386631+02:00
    35         128 2020-05-06 17:37:03.137332+02:00
    
                                                  cmd_raw      cmd_type    delta
    0                               nmap -p 80 172.18.1.5  bash-command  0:00:00
    3                            nmap -p 20000 172.18.1.5  bash-command  0:00:27
    4   for i in {1..65535}; do nmap -p ${i} 172.18.1....  bash-command  0:03:46
    45                        sudo nmap -sP -n 172.18.1.0  bash-command  0:02:02
    46                     sudo nmap -sP -n 172.18.1.0/24  bash-command  0:02:02
    ..                                                ...           ...      ...
    78                                                 ls  bash-command  0:00:46
    32                                python3 myscript.py  bash-command  0:05:07
    33                                python3 myscript.py  bash-command  0:00:42
    34                                python3 myscript.py  bash-command  0:00:32
    35                                python3 myscript.py  bash-command  0:00:28
    
    [79 rows x 5 columns]

%% Cell type:markdown id:7cb9072e tags:

### 9. Two actions time difference to the given number

**Definition**

Find command logs for which there exists a time difference between two successive actions that is lower/higher than a given number.

**Explanation**

The function `command_time_duration_to_number` filters the commands if the time difference between two commands is lower or higher than the given number. The `show_command_time_duration_to_number` prints these commands to console. The third parameter in functions should be `"lower"` or `"higher"` depending on desired output data.

%% Cell type:code id:1762e53f tags:

``` python
def command_time_duration_to_number(logs_df, time_up, state):
    """Keep only commands whose gap to the previous command is at most
    ('lower') or at least ('higher') *time_up* seconds.

    Adds a 'cmd_before' column (previous raw command, '---' for the first
    row) and a formatted 'delta' column.
    """
    threshold = pd.Timedelta('{} seconds'.format(time_up))
    ordered = logs_df.sort_values(by='timestamp')
    ordered['cmd_before'] = ordered['cmd_raw'].shift().fillna('---')
    ordered['delta'] = (ordered['timestamp'] - ordered['timestamp'].shift()).fillna(pd.Timedelta('0 seconds'))
    if state == 'lower':
        ordered = ordered.loc[ordered['delta'] <= threshold]
    else:
        ordered = ordered.loc[ordered['delta'] >= threshold]
    ordered['delta'] = ordered['delta'].apply(lambda gap: format_delta_time(gap))
    return ordered

def show_command_time_duration_to_number(logs_df, time_up, state):
    """Print each selected command with its gap and the preceding command."""
    selected = command_time_duration_to_number(logs_df, time_up, state)
    for _, row in selected.iterrows():
        print('{} - {:60} --> {:40}'.format(str(row.delta), row.cmd_before, row.cmd_raw))
```

%% Cell type:code id:b1626390 tags:

``` python
logs_df = esc.fetch_bash_logs()
command_time_duration_to_number(logs_df, 20, 'lower')

logs_df = esc.fetch_bash_logs()
show_command_time_duration_to_number(logs_df, 20, 'higher')
```

%% Output

    0:00:27 - nmap -p 80 172.18.1.5                                        --> nmap -p 20000 172.18.1.5
    0:03:46 - nmap -p 20000 172.18.1.5                                     --> for i in {1..65535}; do nmap -p ${i} 172.18.1.5 >> out.txt; done
    0:02:02 - for i in {1..65535}; do nmap -p ${i} 172.18.1.5 >> out.txt; done --> sudo nmap -sP -n 172.18.1.0
    0:02:02 - sudo nmap -sP -n 172.18.1.0                                  --> sudo nmap -sP -n 172.18.1.0/24
    0:00:59 - sudo nmap -sP -n 172.18.1.0/24                               --> map -p  172.18.1.5
    0:00:36 - nmap -p  172.18.1.5                                          --> for i in {65535..1}; do nmap -p ${i} 172.18.1.5 >> out.txt; done
    0:00:32 - sudo nmap -sP -n -F 172.18.1.0/24                            --> sudo nmap -sP -n 172.18.1.5
    0:01:36 - nmap -p- 172.18.1.5                                          --> nmap -p0-65535 172.18.1.0/24
    0:01:48 - nmap -p0-65535 172.18.1.0/24                                 --> nmap -F 172.18.1.5
    0:00:51 - nmap -F 172.18.1.5                                           --> nmap -p 172.18.1.5
    0:02:04 - nmap -p 172.18.1.5                                           --> nmap 172.18.1.5
    0:01:18 - nmap 172.18.1.5                                              --> nmap 172.18.1.5
    0:01:41 - nmap 172.18.1.5                                              --> nmap
    0:02:27 - nmap                                                         --> nmap -sV 172.18.1.5
    0:01:14 - nmap -sV 172.18.1.5                                          --> nmap -p 10000 172.18.1.5
    0:00:50 - man nmap                                                     --> nmap -p-  172.18.1.5
    0:00:22 - nmap -p-  172.18.1.5                                         --> nmap -sV -p 10000 172.18.1.5
    0:02:38 - nmap -sV -p 10000 172.18.1.5                                 --> ~htop
    0:00:26 - ls                                                           --> cd /root
    0:00:32 - cd /root                                                     --> ls Documents/
    0:02:29 - ls Downloads/                                                --> man nmap
    0:02:30 - man nmap                                                     --> python CVE_2019_15107.py https://10.10.20.166:10000 ls
    0:00:44 - python CVE_2019_15107.py https://10.10.20.166:10000 ls       --> python CVE_2019_15107.py https://10.10.20.166:10000
    0:01:29 - python CVE_2019_15107.py https://10.10.20.166:10000          --> man metasploit
    0:00:55 - nmap -p-  172.18.1.5                                         --> metasploit
    0:03:54 - metasploit                                                   --> mv script script.py
    0:03:05 - mv script script.py                                          --> curl -L https://10.10.20.166:10000/password_change.cgi
    0:01:23 - curl -L https://10.10.20.166:10000/password_change.cgi       --> script.py 172.18.1.5:10000 ls
    0:01:59 - python script.py 172.18.1.5:10000 ls                         --> python script.py 172.18.1.5:10000/session_login.cgi ls
    0:00:22 - python script.py 172.18.1.5:10000/session_login.cgi ls       --> python script.py 172.18.1.5:10000 ls
    0:01:17 - python script.py 172.18.1.5:10000 ls                         --> msf
    0:01:42 - msf                                                          --> python script.py 172.18.1.5:10000 id
    0:00:44 - python script.py 172.18.1.5:10000 ls                         --> python script.py https://172.18.1.5:10000 ls
    0:01:36 - python script.py http://172.18.1.5:10000 ls                  --> python script.py http://172.18.1.5:10000 cd /root;ls
    0:00:52 - python script.py http://172.18.1.5:10000 cd /root;ls         --> python script.py http://172.18.1.5:10000 "cd /root;ls"
    0:00:21 - python script.py http://172.18.1.5:10000 "cd /root;ls"       --> msf
    0:00:31 - msf                                                          --> python script.py http://172.18.1.5:10000 "cat *"
    0:01:09 - python script.py http://172.18.1.5:10000 "cat *"             --> python script.py http://172.18.1.5:10000 "ls"
    0:01:01 - python script.py http://172.18.1.5:10000 "ls"                --> msfconsole
    0:00:20 - msfconsole                                                   --> python script.py http://172.18.1.5:10000 "cat CHANGELOG"
    0:00:22 - python script.py http://172.18.1.5:10000 "cat CHANGELOG"     --> python script.py http://172.18.1.5:10000 "ls"
    0:00:39 - python script.py http://172.18.1.5:10000 "ls"                --> python script.py http://172.18.1.5:10000 "twofactor_form.cgi"
    0:00:27 - show exploits                                                --> python script.py http://172.18.1.5:10000 "cat *" > out.txt
    0:01:29 - python script.py http://172.18.1.5:10000 "cat *" > out.txt   --> shooooooooooooow exploiiiiiiiiiiii2
    0:02:03 - shooooooooooooow exploiiiiiiiiiiii2                          --> grep "\d{5}" out.txt
    0:02:52 - msfconsole                                                   --> mv myscript.py
    0:00:46 - mv myscript myscript.py                                      --> ls
    0:05:07 - ls                                                           --> python3 myscript.py
    0:00:42 - python3 myscript.py                                          --> python3 myscript.py
    0:00:32 - python3 myscript.py                                          --> python3 myscript.py
    0:00:28 - python3 myscript.py                                          --> python3 myscript.py

%% Cell type:markdown id:e4218a64 tags:

### 10. Queries for command histories

**Definition**

For a given subset of command logs, compute simpledescriptive statistics (sum, min, max, median, avg, stddev).

**Explanation**

The function `bash_log_statistic` gets parameters as an instance of `ElasticSearchConnection` and two level ids - level_from and level_to. It returns statistics across all found sandboxes between these levels (the levels themselves included). The function returns the dictionary with keys - `min`, `max`, `mean`, `median` and `std`. Note that `min` and `max` contain tuples with sandbox_id. The function `show_bash_log_statistic` prints the result to the console.

%% Cell type:code id:95af63e3 tags:

``` python
def bash_log_statistic(elk_connection, level_from, level_to):
    """Descriptive statistics of per-sandbox command counts between two
    training levels (both levels included).

    Returns a dict with keys 'min', 'max' (tuples of (sandbox_id, count)),
    'mean', 'median' and 'std'.
    """
    events_df = elk_connection.fetch_events()
    logs_df = elk_connection.fetch_bash_logs()
    in_range = filter_logs_in_level_range(logs_df, events_df, level_from, level_to)
    counts = in_range.groupby(['sandbox_id'])['cmd_raw'].count().reset_index()
    min_row = counts[counts['cmd_raw'] == counts['cmd_raw'].min()].reset_index()
    max_row = counts[counts['cmd_raw'] == counts['cmd_raw'].max()].reset_index()
    return {
        'min': (min_row.loc[0]['sandbox_id'], min_row.loc[0]['cmd_raw']),
        'max': (max_row.loc[0]['sandbox_id'], max_row.loc[0]['cmd_raw']),
        'mean': counts['cmd_raw'].mean(),
        'median': counts['cmd_raw'].median(),
        'std': counts['cmd_raw'].std(),
    }

def show_bash_log_statistic(elk_connection, level_from, level_to):
    """Pretty-print the statistics computed by bash_log_statistic."""
    stats = bash_log_statistic(elk_connection, level_from, level_to)
    print('STATISTICS FOR BASH LOGS \n')
    print('  {:10} {} cmds (sandbox {})'.format('MIN', stats['min'][1], stats['min'][0]))
    print('  {:10} {} cmds (sandbox {})'.format('MAX', stats['max'][1], stats['max'][0]))
    print('  {:10} {} cmds'.format('MEAN', stats['mean']))
    print('  {:10} {} cmds'.format('MEDIAN', stats['median']))
    print('  {:10} {:.2f} cmds'.format('STD', stats['std']))
```

%% Cell type:code id:77881922 tags:

``` python
print(bash_log_statistic(esc, 61, 62))
show_bash_log_statistic(esc, 61, 62)
```

%% Output

    {'min': (129, 6), 'max': (131, 17), 'mean': 9.5, 'median': 7.5, 'std': 5.066228051190222}
    STATISTICS FOR BASH LOGS
    
      MIN        6 cmds (sandbox 129)
      MAX        17 cmds (sandbox 131)
      MEAN       9.5 cmds
      MEDIAN     7.5 cmds
      STD        5.07 cmds
+226 −0

File added.

Preview size limit exceeded, changes collapsed.