Ruixuan
Tu, Feijun Chen
{ruixuan.tu, fchen222}@wisc.edu
University of Wisconsin-Madison
APIs (pthread_* return 0 on success or an error number on error; sem_* return 0 on success or -1 with errno set on error)
void *mythread(void *arg) { ... }; // example of start_routine
int pthread_create(pthread_t *thread, NULL, void *(*start_routine)(void *), void *arg); // e.g., (&p1, NULL, mythread, "A")
int pthread_join(pthread_t thread, NULL); // wait for thread to finish; e.g., (p1, NULL)
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; // init a lock
int pthread_mutex_lock/unlock(pthread_mutex_t *mutex);
pthread_cond_t cond = PTHREAD_COND_INITIALIZER; // init a condition variable
int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex); // assume mutex is locked, release mutex and put caller to sleep (not ready); when signaled, reacquire mutex before returning
int pthread_cond_signal(pthread_cond_t *cond);
sem_t sem; // semaphore
int sem_init(sem_t *sem, 0, unsigned int value); // 0: shared between threads in same process
int sem_wait(sem_t *sem); // sem->value--; wait if sem->value < 0
int sem_post(sem_t *sem); // sem->value++; wake up one or more waiting threads
Threads (Ch. 26)
Locks (Ch. 28)
criteria: mutual exclusion (correctness), fairness, performance
coarse-grained (big lock that is used any time any critical section is accessed); fine-grained (protect different data structures with different locks, allowing more threads in locked code at once)
controlling interrupts. good: simplicity. bad: privileged operation that requires trust (a program could monopolize the CPU over the OS), does not work on multiprocessors (another thread can enter on another CPU), lost interrupts (e.g., disk), inefficient
void lock() { DisableInterrupts(); }
void unlock() { EnableInterrupts(); }
spin lock: use CPU cycles until lock available, requires a preemptive scheduler (i.e., interrupt via timer). good: correctness (mutex). bad: fairness (no guarantee), performance (overhead on single CPU, good when \(\text{\# threads}\approx \text{\# CPUs}\))
test-and-set or atomic exchange (XCHG)
/*
 * Test-and-set (atomic exchange) modeled in C: write new_value into
 * *old_ptr and hand back whatever was stored there before. On real
 * hardware this is a single atomic instruction; this C version only
 * illustrates its semantics.
 */
int TestAndSet(int *old_ptr, int new_value) {
    const int previous = *old_ptr; /* remember what was there */
    *old_ptr = new_value;          /* overwrite with the new value */
    return previous;               /* caller inspects the old value */
}
/* Spin lock state: flag is 0 when the lock is available, 1 when it is held. */
typedef struct __lock_t { int flag; } lock_t;

/* Initialize the lock to the "available" state. */
void init(lock_t *lock) {
    // 0: lock is available, 1: lock is held
    lock->flag = 0;
}
/*
 * Acquire the spin lock: repeatedly swap 1 into the flag; the swap that
 * reports an old value other than 1 is the one that took the lock.
 */
void lock(lock_t *lock) {
    for (;;) {
        if (TestAndSet(&lock->flag, 1) != 1)
            return; /* flag was not held before our swap: lock acquired */
        /* flag was already 1: spin-wait (do nothing) and retry */
    }
}

/* Release the spin lock by marking the flag as available again. */
void unlock(lock_t *lock) {
    lock->flag = 0;
}
compare-and-swap/compare-and-exchange (CMPXCHG)
/*
 * Compare-and-swap modeled in C: if *ptr currently equals expected,
 * replace it with new_value; in either case return the value that was
 * observed at *ptr before any update.
 */
int CompareAndSwap(int *ptr, int expected, int new_value) {
    const int seen = *ptr;
    if (seen != expected)
        return seen;      /* mismatch: memory is left untouched */
    *ptr = new_value;     /* match: install the new value */
    return seen;
}
/*
 * Acquire the spin lock with compare-and-swap: try to change the flag
 * from 0 to 1 and keep retrying while the observed value is 1.
 */
void lock(lock_t *lock) {
    int seen;
    do {
        seen = CompareAndSwap(&lock->flag, 0, 1);
    } while (seen == 1); /* spin */
}
behaves the same as TestAndSet when used for a spin lock, but also provides lock-free synchronization
load-linked and store-conditional (RISC)
// Load-linked: fetch the value stored at ptr (modeled here as a plain read).
int LoadLinked(int *ptr) { return *ptr; }
// Store-conditional (pseudocode, the condition below is not real C):
// store value into *ptr only if *ptr has not been updated since the
// LoadLinked on the same address. Returns 1 on success, 0 on failure.
int StoreConditional(int *ptr, int value) {
if (no update to *ptr since LoadLinked to this address) {
*ptr = value;
return 1; // success
} else return 0; // failed to update
}
/*
 * Acquire the lock with load-linked / store-conditional: wait until the
 * flag no longer reads 1, then try to store 1; if the conditional store
 * fails, start over.
 */
void lock(lock_t *lock) {
    for (;;) {
        /* spin until the flag stops reading 1 */
        while (LoadLinked(&lock->flag) == 1) {
        }
        /* attempt to claim it; success means we are done */
        if (StoreConditional(&lock->flag, 1) == 1)
            break;
        /* the conditional store failed: try again */
    }
}

/* Release the lock by resetting the flag. */
void unlock(lock_t *lock) {
    lock->flag = 0;
}
ticket locks
store ticket, turn (which process is to enter the critical section)
fetch-and-add (XADD) and yield
/*
 * Fetch-and-add modeled in C: increment *ptr by one and return the value
 * it held before the increment.
 */
int FetchAndAdd(int *ptr) {
    return (*ptr)++;
}
/*
 * Ticket lock state.
 *   ticket: next ticket number to hand out
 *   turn:   ticket number currently allowed to enter
 */
typedef struct __lock_t { int ticket; int turn; } lock_t;

/* Start with ticket 0 being both the next ticket and the current turn. */
void lock_init(lock_t *lock) { lock->ticket = 0; lock->turn = 0; }

/*
 * Acquire: take a ticket with FetchAndAdd, then wait until the lock's
 * turn reaches that ticket, yielding while it is not our turn.
 */
void lock(lock_t *lock) {
    int myturn = FetchAndAdd(&lock->ticket);
    while (lock->turn != myturn)
        yield(); // not our turn yet: yield instead of plain spinning
}

/* Release: advance the turn so the next ticket holder may proceed. */
void unlock(lock_t *lock) { lock->turn = lock->turn + 1; }
test-and-set and yield
/* Mark the shared flag as available (0). */
void init() { flag = 0; }

/*
 * Acquire: try to set the flag with TestAndSet; whenever the old value
 * was 1 (lock held), yield instead of spinning.
 */
void lock() {
    while (TestAndSet(&flag, 1) == 1)
        yield(); // give up CPU
}

/* Release: clear the flag. */
void unlock() { flag = 0; }
queues: sleeping instead of spinning
/*
 * Queue-based lock.
 *   flag:  0 when the lock is free, 1 when it is acquired
 *   guard: spin lock protecting flag and the queue
 *   q:     queue of waiting thread IDs
 */
typedef struct __lock_t {
    int flag;
    int guard;
    queue_t *q;
} lock_t;

/* Reset both flags to 0 and initialize the wait queue. */
void lock_init(lock_t *m) {
    m->guard = 0;
    m->flag = 0;
    queue_init(m->q);
}
/*
 * Acquire the queue-based lock. The guard is taken by spinning; if the
 * lock is free it is claimed, otherwise the caller enqueues itself and
 * parks (sleeps) instead of spinning.
 */
void lock(lock_t *m) {
    while (TestAndSet(&m->guard, 1) == 1)
        ; // acquire guard lock by spinning
    if (m->flag == 0) {
        m->flag = 1; // lock is acquired
        m->guard = 0;
    } else {
        queue_add(m->q, gettid());
        setpark(); // if then interrupted, then park() will return immediately, avoid wakeup race
        m->guard = 0;
        park(); // deschedule caller
    }
}
/*
 * Release the queue-based lock. With no waiters the flag is cleared;
 * otherwise the flag stays set and the next queued thread is woken, so
 * the lock is handed directly to it.
 */
void unlock(lock_t *m) {
    while (TestAndSet(&m->guard, 1) == 1)
        ; // acquire guard lock by spinning
    if (queue_empty(m->q))
        m->flag = 0; // let go of lock; no one wants it
    else
        unpark(queue_remove(m->q)); // keep the lock held for, and wake up, the next thread
    m->guard = 0;
}
good: no waste, avoids starvation
bad: (limited) if a thread is interrupted while acquiring/releasing the lock, other threads spin-wait until it runs again; (without setpark() – announcing that the caller is about to sleep) wakeup race: if another thread releases the lock just before this thread calls park(), this thread sleeps forever
park() and unpark() switch a thread's state between running and waiting/sleeping (not ready)
Linux-based futex locks
void futex_wait(void *address, int expected); // if *address != expected, return immediately, else sleep caller
void futex_wake(void *address); // wake up one thread sleeping on queue
two-phase lock
Locked Data Structures (Ch. 29)
concurrent counter
typedef struct __counter_t {
int global; // global count
; // global lock
pthread_mutex_t glockint local[NUMCPUS]; // per-CPU count
[NUMCPUS]; // ... and locks
pthread_mutex_t llockint threshold; // update frequency
} counter_t;
void init(counter_t *c, int threshold); // record threshold, init locks, init values of all local counts and global count
void update(counter_t *c, int threadID, int amt); // usually, just grab local lock and update local amount; once local count has risen 'threshold', grab global lock and transfer local values to it
int cpu = threadID % NUMCPUS; // map thread ID to CPU ID
int get(counter_t *c); // grab global lock and return global amount (approximate)
concurrent queue
/* Singly linked queue node carrying one int value. */
typedef struct __node_t {
    int value;
    struct __node_t *next;
} node_t;

/* Queue with separate locks for the head and the tail. */
typedef struct __queue_t {
    node_t *head;
    node_t *tail;
    pthread_mutex_t head_lock;
    pthread_mutex_t tail_lock;
} queue_t;
void Queue_Enqueue(queue_t *q, int value); // new tmp node; lock tail; add to tail; unlock tail
int Queue_Dequeue(queue_t *q, int *value); // lock head; remove from head (or empty); unlock head
concurrent linked list hand-over-hand locking/lock coupling: a lock per node, grab next node’s lock and release current node’s lock
Condition Variables (Ch. 30)
condition variable: an explicit queue that threads can put themselves on when some condition is not as desired (by waiting on the condition); when some other thread changes state, it can wake one or multiple waiting threads (might not be all) and allow them to continue (by signaling on the condition)
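rules
(1) hold the lock when calling signal() to avoid race conditions
(2) keep a state variable in addition to the condition variable (the waiter must not sleep forever when signal() is not called; signal() may wake up more than one thread)
(3) use while, not if, when waiting on a condition variable, and call wait() when the condition is not satisfied
join() implementation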
/*
 * Mark thread t as finished and signal its condition variable, all while
 * holding t's mutex.
 */
void thread_exit(thread_t *t) {
    mutex_lock(&t->mutex);
    t->done = 1; // recorded as state: the thread might already have terminated before join()
    cond_signal(&t->cond);
    mutex_unlock(&t->mutex);
}
/*
 * Wait until thread t has set done. The check is a while loop, so the
 * state is re-tested after every wakeup.
 */
void thread_join(thread_t *t) {
    mutex_lock(&t->mutex);
    while (t->done == 0) // rule (3): while, not if
        cond_wait(&t->cond, &t->mutex);
    mutex_unlock(&t->mutex);
}
producer/consumer (bounded buffer) problem
put and get routines
int buffer[MAX];
int fill_ptr = 0, use_ptr = 0, count = 0;
/*
 * Store value at the fill position of the circular buffer, advance the
 * fill position (wrapping at MAX) and increase the element count.
 * Does not itself check whether the buffer is full.
 */
void put(int value) {
    buffer[fill_ptr] = value;
    fill_ptr = (fill_ptr + 1) % MAX;
    count++;
}
int get() {
int tmp = buffer[use_ptr];
= (use_ptr + 1) % MAX;
use_ptr --;
countreturn tmp;
}
producer/consumer synchronization
, fill; mutex_t mutex;
cond_t emptyvoid *producer(void *arg) {
int i;
for (i = 0; i < loops; i++) {
(&mutex);
pthread_mutex_lockwhile (count == MAX)
(&empty, &mutex);
pthread_cond_wait(i);
put(&fill);
pthread_cond_signal(&mutex);
pthread_mutex_unlock}
}
/*
 * Consumer: for each of `loops` iterations, wait (under the mutex) while
 * the buffer is empty, get one value, signal `empty`, then print the
 * value after releasing the mutex.
 */
void *consumer(void *arg) {
    int i;
    for (i = 0; i < loops; i++) {
        pthread_mutex_lock(&mutex);
        while (count == 0)                      // buffer empty: wait on fill
            pthread_cond_wait(&fill, &mutex);
        int tmp = get();
        pthread_cond_signal(&empty);            // signal the other condition variable
        pthread_mutex_unlock(&mutex);
        printf("%d\n", tmp);
    }
    return NULL; // non-void function must return a value
}
Mesa semantics: when you call signal(), you do not immediately switch to a waiting thread; instead, a waiting thread is marked as ready
problems: (1) no data when a consumer wakes up (another consumer ran first), solved by while; (2) all threads sleep (after a producer filled data and a consumer exhausted it, then woke another consumer), solved by never letting a consumer/producer wake other consumers/producers, using the two condition variables fill and empty; (3) only one thread can fill or use a buffer at a time, solved by unlocking while filling or using the next buffer and locking again before updating count
Semaphores (Ch. 31)
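binary semaphore (lock): initial value 1, sem_wait() as lock(), sem_post() as unlock()
ordering (parent waits for child): initial value 0, parent runs and calls sem_wait() to sleep (value == -1), child runs and calls sem_post() to wake parent (value == 0)
producer/consumer (bounded buffer): no count, instead semaphores empty and full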
, full, mutex;
sem_t emptyvoid *producer(void *arg) {
int i;
for (i = 0; i < loops; i++) {
(&empty);
sem_wait(&mutex); // not outer to avoid deadlock
sem_wait(i);
put(&mutex); sem_post(&full);
sem_post}
}
/*
 * Consumer: for each of `loops` iterations, wait on `full`, take the
 * mutex semaphore around get(), post `empty`, then print the value.
 */
void *consumer(void *arg) {
    int i;
    for (i = 0; i < loops; i++) {
        sem_wait(&full);
        sem_wait(&mutex);
        int tmp = get();
        sem_post(&mutex);
        sem_post(&empty);
        printf("%d\n", tmp);
    }
    return NULL; // non-void function must return a value
}
/* Set the initial values of the three semaphores used by producer/consumer. */
int main() {
    sem_init(&empty, 0, MAX); // MAX are empty
    sem_init(&full, 0, 0);    // 0 are full
    sem_init(&mutex, 0, 1);   // lock
}
Bugs (Ch. 32)
\(1\) s == \(10^3\) ms == \(10^6\) μs
device interaction: (1) explicit I/O instructions (IN and OUT in x86); (2) memory-mapped I/O, device registers mapped into memory, != mmap
interrupts: wait with sem_wait(device_ready); when ready, use sem_post(device_ready) to issue interrupt
disk address, e.g., surface 3, track 5, sector 7; platters spin to \(\phi\) by spindle (rpm), arm assembly moves to \(r\) simultaneously, only one head R/W at one time
comparison: \(N\) disks each with \(B\) blocks, \(S\) sequential bandwidth of a disk, \(R\) random bandwidth of a disk, \(T\) time a request to a single disk would take
reasons for multiple drives: (1) disk failure might occur, (2) capacity is not enough, (3) improve performance
RAID-0 – striping (no redundancy): store the data evenly across the disks
layout with chunk size = 1
disk 0 disk 1 disk 2 disk 3
0 1 2 3
4 5 6 7
logical address A, disk_id = A % disk_count, offset = A / disk_count
read / write: direct read / write, each issues 1 I/O
RAID-1 – mirroring: have 2 copies of each block on different disks; (!) issuing large I/O requests to different parts of each mirror could achieve full bandwidth
layout
disk 0 disk 1 disk 2 disk 3
0 0 1 1
2 2 3 3
read: directly read one of the copies, 1 I/O; write: write to all copies in parallel, \(M\) (mirroring level) I/O; recovery: when a disk fails, there is another copy of data to be used
RAID-4 – saving space with parity: use a disk as a parity disk, each bit stores the parity information about the other bits in that position on other disks
layout
disk 0 disk 1 disk 2 disk 3
0 1 2 P0
3 4 5 P1
6 7 8 P2
read: direct read, issue 1 I/O; parallel read at most \(N - 1\) since one disk is parity disk
write (use either): (1) additive parity: read the other data blocks in the stripe and compute parity; write the block to be changed and the new parity block; needs \(N-2\) reads and 2 writes. (2) subtractive parity: read old data and old parity, then compute new parity and write 2 blocks; needs 2 reads and 2 writes
parity computation: parity of a row is the XOR of all the bits in that row
RAID-5 – rotated parity: store the parity block on different disks sequentially in a rotated manner (e.g. first parity block on last disk, second one on second last)
layout
disk 0 disk 1 disk 2 disk 3
0 1 2 P0
3 4 P1 5
6 P2 7 8
read: same as RAID-4, random better as used all disks; write: same as RAID-4, random much better as allows request parallelism
create: creat() system call
append: write() system call; calls lseek() to the end of file (set the offset to point to the end of file) then write there
truncate: O_TRUNC flag in open()
delete: unlink(). user-level cmd rm
rename: rename(). user-level cmd mv
hard link: ln
symbolic link: ln -s
root dir: /; ////// == /
. refers to current dir; .. refers to parent dir
disk layout: SidIDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
superblock (S): contains info about the entire FS; tells where the other block regions are
bitmaps (i for inode, d for data): track if a block is free
inode table (I, for 32 inodes with 128 bytes each): stores type (regular file, dir, sym link), ownership, access rights, size, # blocks, pointers to data blocks (direct ptrs: a fixed number of pointers, each holding the address of a data block, thus imposing a max file size limit; indirect ptr: usually the last ptr in the array of direct ptrs, pointing to another data block that is full of direct ptrs)
data blocks (D): store data only
mkfs(): creates an empty file system (just a root directory)
write(fd, buffer, size): (1) allocate a data block: (1-1) read the data bitmap and find a free block; (1-2) write to data bitmap. (2) update inode: (2-1) read relevant inode block and update inode; (2-2) write inode block back. (3) write data to data block.
fsck checks: dirs contain . and ..? dir points to valid inodes? inode size and nblocks match? free bitmap? # dir entries == inode link count (update link count + mv to /lost+found)? different inodes point to same block (duplicate block)? bad ptrs (remove ref)?
NFS protocol: GETATTR, SETATTR, LOOKUP, READ, WRITE, CREATE, REMOVE, MKDIR, RMDIR, READDIR; could accelerate by client-side caching (inconsistency in 3 sec before cache timeout, must flush-on-close), but not server-side write buffering