In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import re
import matplotlib.pyplot as plt
In [2]:
# Run "git log > git-log.txt" to get the text file containing the git log of CS320-F23 repo.
with open("git-log.txt", encoding='utf-16') as f:
text = f.read()
print(text[0:320])
commit ff8d4fcfdaf6dac2ba5a0c52c166af7fdb6e3c7d Author: Young Wu <wu489@wisc.edu> Date: Sun Oct 29 20:32:39 2023 -0500 Add files via upload commit 61f95acad3436190c914c8466fa292bba752190b Author: Young Wu <wu489@wisc.edu> Date: Sun Oct 29 20:29:27 2023 -0500 Update README.md commit 5a0d3a025c02aaea5c066
In [3]:
# Find the commit numbers (40 hex digits)
commits = re.findall(r"[0-9a-fA-F]{40}", text)
commits[0:10]
Out[3]:
['ff8d4fcfdaf6dac2ba5a0c52c166af7fdb6e3c7d', '61f95acad3436190c914c8466fa292bba752190b', '5a0d3a025c02aaea5c06633aec126ed523c12dd1', '1f7589a6a342cd05775d616766512da66d83326b', '46310d481036b71421318e991a2333516dfd3a87', 'd3cff3c9bfca1e083851bb9997e7054e68d37697', '908c27b3fc58ceabe2429a885746b6008581d757', '5b583c88a6c178011564fafe031153153a0b4e6e', '5a97bcc334ddf0ede8fc1300061bc3f8b2627282', 'dd853ffabd1f4462cf067ee86e4e0d3ec6213f11']
In [4]:
# Find the emails (something@something.something)
emails = re.findall(r"<.*@.*\..*>", text)
emails[0:10]
Out[4]:
['<wu489@wisc.edu>', '<wu489@wisc.edu>', '<wu489@wisc.edu>', '<sophia1998shen@gmail.com>', '<sophia1998shen@gmail.com>', '<wu489@wisc.edu>', '<wu489@wisc.edu>', '<sophia1998shen@gmail.com>', '<sophia1998shen@gmail.com>', '<sophia1998shen@gmail.com>']
In [5]:
# Find the email without the < ... >
actual_emails = re.findall(r"<(.+@.+\..+)>", text)
actual_emails[0:10]
Out[5]:
['wu489@wisc.edu', 'wu489@wisc.edu', 'wu489@wisc.edu', 'sophia1998shen@gmail.com', 'sophia1998shen@gmail.com', 'wu489@wisc.edu', 'wu489@wisc.edu', 'sophia1998shen@gmail.com', 'sophia1998shen@gmail.com', 'sophia1998shen@gmail.com']
In [6]:
# Find the email ids before the @...
email_ids = re.findall(r"<(.+)@.*\..*>", text)
plt.hist(email_ids)
Out[6]:
(array([13., 0., 38., 0., 0., 1., 0., 1., 0., 2.]), array([0. , 0.4, 0.8, 1.2, 1.6, 2. , 2.4, 2.8, 3.2, 3.6, 4. ]), <BarContainer object of 10 artists>)
In [7]:
# Find the dates and times (e.g. Fri Sep 1 00:00:00)
dates = re.findall(r"((\w{3})\s\w{3}\s\d{1,2})\s(\d{2}:\d{2}:\d{2})\s\d{4}", text)
dates[0:10]
Out[7]:
[('Sun Oct 29', 'Sun', '20:32:39'), ('Sun Oct 29', 'Sun', '20:29:27'), ('Sun Oct 29', 'Sun', '20:28:57'), ('Tue Oct 24', 'Tue', '07:02:14'), ('Tue Oct 24', 'Tue', '07:02:07'), ('Sun Oct 22', 'Sun', '11:40:10'), ('Sun Oct 22', 'Sun', '11:39:41'), ('Thu Oct 19', 'Thu', '08:01:05'), ('Thu Oct 19', 'Thu', '00:59:41'), ('Thu Oct 19', 'Thu', '00:59:34')]
In [8]:
# Plot a histogram of days
days = list(map(lambda x: x[1], dates))
labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
counts = list(map(lambda x: days.count(x), labels))
plt.bar(labels, counts)
Out[8]:
<BarContainer object of 7 artists>