class MWFileRC : public MWRMComm

A Resource Management and Communication class that uses Condor for underlying support of resource management

Inheritance:


Public Methods

void CheckLogFilesRunning( )
Always get the last result
int handle_finished_worker( int i )
Handle a message from the worker
NUM_FILE_TYPES MWFileRC( bool val, int id )
Constructor
~MWFileRC()
Destructor

Public

A. Resource Management Routines
int setup( int argc, char *argv[], int *mytid, int *mastertid )
Initialises
void exit( int exitval )
Shutdown
int init_beginning_workers( int *nworkers, MWWorkerID ***workers )
Initialize workers if already some have started up
int start_worker( MWWorkerID *w )
This function is actually a misnomer
int removeWorker( MWWorkerID *w )
This function removes an existing worker
int hostaddlogic( int *w )
Figure out whether or not to generate a new worker depending on whether new requests have been made
int config( int *, int *, MWWorkerID *** )
A dummy function
int read_RMstate( FILE *fp = NULL )
A routine for reading in the MW-File state at the time of checkpointing
int write_RMstate( FILE *fp = NULL )
A routine for writing in the MW-File state at the time of checkpointing
B. Communication Routines
int initsend( int useless = 0 )
Initialize the send buffer
int send( int toWhom, int msgtag )
Send function
int recv( int fromWhom, int msgtag )
Recv function
int bufinfo( int buf_id, int *len, int *tag, int *sending_host )
Get some info about the recv buffer
int pack( char *bytes, int nitem, int stride = 1 )
pack some bytes
int pack( float *f, int nitem, int stride = 1 )
float
int pack( double *d, int nitem, int stride = 1 )
double
int pack( int *i, int nitem, int stride = 1 )
int
int pack( unsigned int *ui, int nitem, int stride = 1 )
unsigned int
int pack( short *sh, int nitem, int stride = 1 )
short
int pack( unsigned short *ush, int nitem, int stride = 1 )
unsigned short
int pack( long *l, int nitem, int stride = 1 )
long
int pack( unsigned long *ul, int nitem, int stride = 1 )
unsigned long
int pack( char *str )
string
int unpack( char *bytes, int nitem, int stride = 1 )
Unpack some bytes
int unpack( float *f, int nitem, int stride = 1 )
float
int unpack( double *d, int nitem, int stride = 1 )
double
int unpack( int *i, int nitem, int stride = 1 )
int
int unpack( unsigned int *ui, int nitem, int stride = 1 )
unsigned int
int unpack( short *sh, int nitem, int stride = 1 )
short
int unpack( unsigned short *ush, int nitem, int stride = 1 )
unsigned short
int unpack( long *l, int nitem, int stride = 1 )
long
int unpack( unsigned long *ul, int nitem, int stride = 1 )
unsigned long
int unpack( char *str )
string

Private Fields

struct FileWorker *fileWorkers
An array of the number of workers
int actual_task
The actual task that is being executed
int CHECKLOG_FREQ
This determines how frequently we should check the log files
long long checksum
The checksum calculated
char control_directory[256]
The control directory the master reads for all resource management functions
int current_num_workers
The number of workers that were present
int cyclePosition
This is a variable that keeps the cycle in effect
int expected_number
The expected number of the next message
int FileRCID
The value is the id that the worker gets
int* hostadd_reqs
A variable array keeping track of how many have been requested
int** hostaddind_reqs
A variable to keep track of which of each have been requested
char init_file[256]
The init file
char input_directory[256]
The directory where the master has to send the work
bool isMaster
The bool indicates the mode of the RC instance: a true value means that it is a master and a false value means that it is a worker
int master_expected_number
The expected_number of the next message that the master is expecting
int MasterUp
Internal variable that is init to what the worker has to send to the upper layer in case of a master wakeup
int max_num_workers
An integer corresponding to the number of fileWorkers struct that we had/have as the maximum;
char moment_worker_file[256]
The file in the control directory that will contain the momentary number of workers
int msgTag
The tag of the message that just came in
char output_directory[256]
The directory where all the workers have to send their output
List* recvList
A list of all the items that are received
List* sendList
A list of all the items that are sent in a send after a series of packs
int subId
A variable to keep track of submit files
int submitted_num_workers
The number of submitted workers
int turnNo
This keeps the track of how many cycles were made
int whomRecv
The message came from whom
long worker_timeout
The worker timeout in minutes
int* workerArch
A variable array helping in choosing the next arch to choose from
FileRCEvent* workerEvents
An array of message tags to be sent to the upper layers as to what tag has to be sent with the event associated with this worker

Private Methods

void CheckLogFilesResuscicate( )
Is Called when we are recovering from a crash
int ChoseArchNum()
Get an architecture number
int do_spawn( int numworkers, int arch )
Create a worker
void GetCondorId( char *lgfile, int *cId, int *pId )
Find the condor_ID of the worker with this log file
int GetCounter( char *file1, char *file2 )
Another helping routine
int GetMasterExpectedNumber( char *file )
Another helping routine
int GetWorkerCounter( char *file )
The helping routine for resuscicate
int handle_executing_worker( int i )
Called when the worker starts executing first
int handle_killed_worker( int i )
Is Called when a task is dead
int handle_master_executing()
Is called when the worker detects that the master has come up
int handle_resumed_worker( int i )
Is Called when a host is resumed
int handle_suspended_worker( int i )
Is Called when a host is suspended
int handle_transited_worker( int i )
Called in no_checkpoint mode
int handle_work( int msgtag )
Is Called when a message is received by the worker
void inform_target_num_workers( )
Information by the upper layer
void InitStructures( )
A function that inits some internal structures
bool IsComplete( int i )
Determine whether worker i has sent a message
void killWorker( int i )
Kill a worker
int master_recv( int fromWhom, int msgtag )
Master receive
void resuscicate( )
The main resuscicate function
int setup_notifies( int worker_id )
Notify various events
int worker_recv( int fromWhom, int msgtag )
Worker receive

Private

Polls the log files to see some changes
Some private functions

Documentation

A Resource Management and Communication class that uses Condor for underlying support of resource management. Some crude inter-process communication is provided using the userlog feature of Condor.
NUM_FILE_TYPES MWFileRC( bool val, int id )
Constructor

~MWFileRC()
Destructor

A. Resource Management Routines
Here we implement the pure virtual functions found in our parent class, MWRMComm.

int setup( int argc, char *argv[], int *mytid, int *mastertid )
Initialises. Depending on whether it is master or worker instance it initializes all the internal variables.

void exit( int exitval )
Shutdown. Kills all the workers if it is master

int init_beginning_workers( int *nworkers, MWWorkerID ***workers )
Initialize workers if already some have started up

int start_worker( MWWorkerID *w )
This function is actually a misnomer. It DOES NOT spawn a new worker. Rather, it just inits the structure that is passed to it

int removeWorker( MWWorkerID *w )
This function removes an existing worker

int hostaddlogic( int *w )
Figure out whether or not to generate a new worker depending on whether new requests have been made

int config( int *, int *, MWWorkerID *** )
A dummy function. For the sake of conformity with the Pvm

int read_RMstate( FILE *fp = NULL )
A routine for reading in the MW-File state at the time of checkpointing

int write_RMstate( FILE *fp = NULL )
A routine for writing in the MW-File state at the time of checkpointing

B. Communication Routines
Unlike MWPvmRC, the communication routines are non-trivial because Condor provides no inter-process communication. Thus we use files for communication: a send is essentially a file-write operation and a recv is a file-read operation. We maintain two lists, the sendList and the recvList, for taking care of what is to be written to or read from the files. As in PVM, a user begins by calling initsend, which creates a new list. Calls to pack insert into the list what is being packed. Finally, a send writes the entire thing into a file identified by the destination. Corresponding things happen in recv.

int initsend( int useless = 0 )
Initialize the send buffer

int send( int toWhom, int msgtag )
Send function

int recv( int fromWhom, int msgtag )
Recv function

int bufinfo( int buf_id, int *len, int *tag, int *sending_host )
Get some info about the recv buffer

int pack( char *bytes, int nitem, int stride = 1 )
pack some bytes

int pack( float *f, int nitem, int stride = 1 )
float

int pack( double *d, int nitem, int stride = 1 )
double

int pack( int *i, int nitem, int stride = 1 )
int

int pack( unsigned int *ui, int nitem, int stride = 1 )
unsigned int

int pack( short *sh, int nitem, int stride = 1 )
short

int pack( unsigned short *ush, int nitem, int stride = 1 )
unsigned short

int pack( long *l, int nitem, int stride = 1 )
long

int pack( unsigned long *ul, int nitem, int stride = 1 )
unsigned long

int pack( char *str )
string

int unpack( char *bytes, int nitem, int stride = 1 )
Unpack some bytes

int unpack( float *f, int nitem, int stride = 1 )
float

int unpack( double *d, int nitem, int stride = 1 )
double

int unpack( int *i, int nitem, int stride = 1 )
int

int unpack( unsigned int *ui, int nitem, int stride = 1 )
unsigned int

int unpack( short *sh, int nitem, int stride = 1 )
short

int unpack( unsigned short *ush, int nitem, int stride = 1 )
unsigned short

int unpack( long *l, int nitem, int stride = 1 )
long

int unpack( unsigned long *ul, int nitem, int stride = 1 )
unsigned long

int unpack( char *str )
string

Some private functions

int handle_finished_worker( int i )
Handle a message from the worker

int handle_resumed_worker( int i )
Is Called when a host is resumed

int handle_suspended_worker( int i )
Is Called when a host is suspended

int handle_killed_worker( int i )
Is Called when a task is dead

int handle_executing_worker( int i )
Called when the worker starts executing first

int handle_transited_worker( int i )
Called in no_checkpoint mode

int handle_master_executing()
Is called when the worker detects that the master has come up

int handle_work( int msgtag )
Is Called when a message is received by the worker

Polls the log files to see some changes

void CheckLogFilesRunning( )
Always get the last result

void CheckLogFilesResuscicate( )
Is Called when we are recovering from a crash

void resuscicate( )
The main resuscicate function

int GetWorkerCounter( char *file )
The helping routine for resuscicate

int GetCounter( char *file1, char *file2 )
Another helping routine

int GetMasterExpectedNumber( char *file )
Another helping routine

void GetCondorId( char *lgfile, int *cId, int *pId )
Find the condor_ID of the worker with this log file

int do_spawn( int numworkers, int arch )
Create a worker

void killWorker( int i )
Kill a worker

int setup_notifies( int worker_id )
Notify various events

int master_recv( int fromWhom, int msgtag )
Master receive

int worker_recv( int fromWhom, int msgtag )
Worker receive

bool IsComplete( int i )
Determine whether worker i has sent a message

int ChoseArchNum()
Get an architecture number

void inform_target_num_workers( )
Information by the upper layer

void InitStructures( )
A function that inits some internal structures

bool isMaster
The bool indicates the mode of the RC instance: a true value means that it is a master and a false value means that it is a worker. We need to make it a tristate, as sometimes an RC can be both a master and a worker.

int FileRCID
The value is the id that the worker gets. For master it is of no use.

int expected_number
The expected number of the next message

int master_expected_number
The expected_number of the next message that the master is expecting

int actual_task
The actual task that is being executed

char output_directory[256]
The directory where all the workers have to send their output

char input_directory[256]
The directory where the master has to send the work

char control_directory[256]
The control directory the master reads for all resource management functions

char moment_worker_file[256]
The file in the control directory that will contain the momentary number of workers

char init_file[256]
The init file

int current_num_workers
The number of workers that were present. This is an internal variable that will be used for copying the entire thing when the target number changes. This is the number that will be maintained close to target_num_workers.

int submitted_num_workers
The number of submitted workers

int max_num_workers
An integer corresponding to the number of fileWorkers struct that we had/have as the maximum;

FileRCEvent* workerEvents
An array of message tags to be sent to the upper layers as to what tag has to be sent with the event associated with this worker

struct FileWorker *fileWorkers
An array of the number of workers

List* sendList
A list of all the items that are sent in a send after a series of packs

List* recvList
A list of all the items that are received

int cyclePosition
This is a variable that keeps the cycle in effect. The receive of ours goes in cycles to ensure fairness to the messages from all the slaves.

int CHECKLOG_FREQ
This determines how frequently we should check the log files

int turnNo
This keeps the track of how many cycles were made

int msgTag
The tag of the message that just came in

int whomRecv
The message came from whom

int MasterUp
Internal variable that is init to what the worker has to send to the upper layer in case of a master wakeup

int* workerArch
A variable array helping in choosing the next arch to choose from

int subId
A variable to keep track of submit files

int* hostadd_reqs
A variable array keeping track of how many have been requested

int** hostaddind_reqs
A variable to keep track of which of each have been requested

long long checksum
The checksum calculated

long worker_timeout
The worker timeout in minutes


This class has no child classes.

alphabetic index hierarchy of classes


this page has been generated automatically by doc++

(c)opyright by Malte Zöckler, Roland Wunderling
contact: doc++@zib.de