In [1]:
# Code attribution: Yiyin Shen, Tyler Caraza-Harter
# Imports
import re
import matplotlib.pyplot as plt
In [2]:
# Run "git log > git-log.txt" to get the text file containing the git log of CS320-F23 repo.
with open("git-log.txt", encoding='utf-16') as f:
    text = f.read()
print(text[0:320])
commit ff8d4fcfdaf6dac2ba5a0c52c166af7fdb6e3c7d
Author: Young Wu <wu489@wisc.edu>
Date:   Sun Oct 29 20:32:39 2023 -0500

    Add files via upload

commit 61f95acad3436190c914c8466fa292bba752190b
Author: Young Wu <wu489@wisc.edu>
Date:   Sun Oct 29 20:29:27 2023 -0500

    Update README.md

commit 5a0d3a025c02aaea5c066
In [3]:
# Find the commit numbers (40 hex digits)
commits = re.findall(r"[0-9a-fA-F]{40}", text)
commits[0:10]
Out[3]:
['ff8d4fcfdaf6dac2ba5a0c52c166af7fdb6e3c7d',
 '61f95acad3436190c914c8466fa292bba752190b',
 '5a0d3a025c02aaea5c06633aec126ed523c12dd1',
 '1f7589a6a342cd05775d616766512da66d83326b',
 '46310d481036b71421318e991a2333516dfd3a87',
 'd3cff3c9bfca1e083851bb9997e7054e68d37697',
 '908c27b3fc58ceabe2429a885746b6008581d757',
 '5b583c88a6c178011564fafe031153153a0b4e6e',
 '5a97bcc334ddf0ede8fc1300061bc3f8b2627282',
 'dd853ffabd1f4462cf067ee86e4e0d3ec6213f11']
In [4]:
# Find the emails (something@something.something)
emails = re.findall(r"<.*@.*\..*>", text)
emails[0:10]
Out[4]:
['<wu489@wisc.edu>',
 '<wu489@wisc.edu>',
 '<wu489@wisc.edu>',
 '<sophia1998shen@gmail.com>',
 '<sophia1998shen@gmail.com>',
 '<wu489@wisc.edu>',
 '<wu489@wisc.edu>',
 '<sophia1998shen@gmail.com>',
 '<sophia1998shen@gmail.com>',
 '<sophia1998shen@gmail.com>']
In [5]:
# Find the email without the < ... >
actual_emails = re.findall(r"<(.+@.+\..+)>", text)
actual_emails[0:10]
Out[5]:
['wu489@wisc.edu',
 'wu489@wisc.edu',
 'wu489@wisc.edu',
 'sophia1998shen@gmail.com',
 'sophia1998shen@gmail.com',
 'wu489@wisc.edu',
 'wu489@wisc.edu',
 'sophia1998shen@gmail.com',
 'sophia1998shen@gmail.com',
 'sophia1998shen@gmail.com']
In [6]:
# Find the email ids before the @...
email_ids = re.findall(r"<(.+)@.*\..*>", text)
plt.hist(email_ids)
Out[6]:
(array([13.,  0., 38.,  0.,  0.,  1.,  0.,  1.,  0.,  2.]),
 array([0. , 0.4, 0.8, 1.2, 1.6, 2. , 2.4, 2.8, 3.2, 3.6, 4. ]),
 <BarContainer object of 10 artists>)
No description has been provided for this image
In [7]:
# Find the dates and times (e.g. Fri Sep 1 00:00:00)
dates = re.findall(r"((\w{3})\s\w{3}\s\d{1,2})\s(\d{2}:\d{2}:\d{2})\s\d{4}", text)
dates[0:10]
Out[7]:
[('Sun Oct 29', 'Sun', '20:32:39'),
 ('Sun Oct 29', 'Sun', '20:29:27'),
 ('Sun Oct 29', 'Sun', '20:28:57'),
 ('Tue Oct 24', 'Tue', '07:02:14'),
 ('Tue Oct 24', 'Tue', '07:02:07'),
 ('Sun Oct 22', 'Sun', '11:40:10'),
 ('Sun Oct 22', 'Sun', '11:39:41'),
 ('Thu Oct 19', 'Thu', '08:01:05'),
 ('Thu Oct 19', 'Thu', '00:59:41'),
 ('Thu Oct 19', 'Thu', '00:59:34')]
In [8]:
# Plot a histogram of days
days = list(map(lambda x: x[1], dates))
labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
counts = list(map(lambda x: days.count(x), labels))
plt.bar(labels, counts)
Out[8]:
<BarContainer object of 7 artists>
No description has been provided for this image