Most generic dataset type. More...
Public Member Functions | |
void | AliasMatrix (const Matrix &matrix_in) |
Initializes as an alias or mirror of an existing matrix, assuming all features are continuous. | |
void | AliasMatrix (const Matrix &matrix_in) |
Initializes as an alias or mirror of an existing matrix, assuming all features are continuous. | |
void | CopyMatrix (const Matrix &matrix_in) |
Initializes from a matrix copying all contents, assuming all features are continuous. | |
void | CopyMatrix (const Matrix &matrix_in) |
Initializes from a matrix copying all contents, assuming all features are continuous. | |
void | Format (index_t feature, index_t point, String *result) const |
Formats as text a particular location of the data set. | |
void | Format (index_t feature, index_t point, String *result) const |
Formats as text a particular location of the data set. | |
double | get (index_t feature, index_t point) const |
Gets the numeric value of a particular feature and point. | |
double | get (index_t feature, index_t point) const |
Gets the numeric value of a particular feature and point. | |
int | get_int (index_t feature, index_t point) const |
Gets the integer value of a particular feature and point. | |
int | get_int (index_t feature, index_t point) const |
Gets the integer value of a particular feature and point. | |
void | GetLabels (ArrayList< double > &labels_list, ArrayList< index_t > &labels_index, ArrayList< index_t > &labels_ct, ArrayList< index_t > &labels_startpos) const |
Gets a list and indices of labels in a labeled dataset. | |
void | GetLabels (ArrayList< double > &labels_list, ArrayList< index_t > &labels_index, ArrayList< index_t > &labels_ct, ArrayList< index_t > &labels_startpos) const |
Gets a list and indices of labels in a labeled dataset. | |
DatasetInfo & | info () |
Metadata about the feature types and names for the dataset. | |
const DatasetInfo & | info () const |
Metadata about the feature types and names for the dataset. | |
DatasetInfo & | info () |
Metadata about the feature types and names for the dataset. | |
const DatasetInfo & | info () const |
Metadata about the feature types and names for the dataset. | |
void | InitBlank () |
Initializer that omits the matrix and info - you must initialize these yourself. | |
void | InitBlank () |
Initializer that omits the matrix and info - you must initialize these yourself. | |
success_t | InitFromFile (TextLineReader *reader, const char *filename="dataset") |
Reads in an ARFF or CSV/WSV file. | |
success_t | InitFromFile (const char *fname) |
Reads in an ARFF or CSV/WSV file. | |
success_t | InitFromFile (TextLineReader *reader, const char *filename="dataset") |
Reads in an ARFF or CSV/WSV file. | |
success_t | InitFromFile (const char *fname) |
Reads in an ARFF or CSV/WSV file. | |
Matrix & | matrix () |
Returns the matrix that stores all the data (can be used for modification). | |
const Matrix & | matrix () const |
Returns the matrix that stores all the data. | |
Matrix & | matrix () |
Returns the matrix that stores all the data (can be used for modification). | |
const Matrix & | matrix () const |
Returns the matrix that stores all the data. | |
index_t | n_features () const |
Gets the number of features/attributes the dataset has. | |
index_t | n_features () const |
Gets the number of features/attributes the dataset has. | |
index_t | n_labels () const |
Gets the number of labels in a labeled dataset. | |
index_t | n_labels () const |
Gets the number of labels in a labeled dataset. | |
index_t | n_points () const |
Gets the number of points/instances in the dataset. | |
index_t | n_points () const |
Gets the number of points/instances in the dataset. | |
void | OwnMatrix (Matrix *matrix_in) |
Initializes by becoming the owner of an existing matrix, assuming all features are continuous. | |
void | OwnMatrix (Matrix *matrix_in) |
Initializes by becoming the owner of an existing matrix, assuming all features are continuous. | |
double * | point (index_t point) |
Gets the "raw" form of a particular point. | |
const double * | point (index_t point) const |
Gets the "raw" form of a particular point. | |
double * | point (index_t point) |
Gets the "raw" form of a particular point. | |
const double * | point (index_t point) const |
Gets the "raw" form of a particular point. | |
void | set (index_t feature, index_t point, double d) |
Modifies a value in the dataset. | |
void | set (index_t feature, index_t point, double d) |
Modifies a value in the dataset. | |
void | SplitTrainTest (int folds, int fold_number, const ArrayList< index_t > &permutation, Dataset *train, Dataset *test) const |
Creates a training and test dataset for k-fold cross validation. | |
void | SplitTrainTest (int folds, int fold_number, const ArrayList< index_t > &permutation, Dataset *train, Dataset *test) const |
Creates a training and test dataset for k-fold cross validation. | |
success_t | WriteArff (const char *fname) const |
Writes to an ARFF file. | |
success_t | WriteArff (const char *fname) const |
Writes to an ARFF file. | |
success_t | WriteCsv (const char *fname, bool header=false) const |
Writes to a CSV file. | |
success_t | WriteCsv (const char *fname, bool header=false) const |
Writes to a CSV file. |
Most generic dataset type.
To allow polymorphic usage, everything is internally stored as doubles. If your data are discrete, the integer values are stored in the doubles.
(Implementation note: To allow efficient polymorphic use of mixed data, we found it is far more efficient to store these as doubles than to introduce any sort of casting or pointer arithmetic overhead. Space may be an issue, especially if this is Boolean data.)
Definition at line 380 of file dataset.h.
void Dataset::AliasMatrix | ( | const Matrix & | matrix_in | ) | [inline] |
Initializes as an alias or mirror of an existing matrix, assuming all features are continuous.
This does not copy the matrix but instead refers to an existing matrix, and changes in one will be reflected in the other. Make sure the other matrix does not fall out of scope and get freed!
matrix_in | data where rows are features, columns are points |
Definition at line 629 of file dataset.h.
References InitBlank(), and DatasetInfo::InitContinuous().
void Dataset::AliasMatrix | ( | const Matrix & | matrix_in | ) | [inline] |
Initializes as an alias or mirror of an existing matrix, assuming all features are continuous.
This does not copy the matrix but instead refers to an existing matrix, and changes in one will be reflected in the other. Make sure the other matrix does not fall out of scope and get freed!
matrix_in | data where rows are features, columns are points |
Definition at line 629 of file dataset.h.
References InitBlank(), and DatasetInfo::InitContinuous().
Referenced by data::Save().
void Dataset::CopyMatrix | ( | const Matrix & | matrix_in | ) | [inline] |
Initializes from a matrix copying all contents, assuming all features are continuous.
matrix_in | data where rows are features, columns are points |
Definition at line 597 of file dataset.h.
References InitBlank(), and DatasetInfo::InitContinuous().
void Dataset::CopyMatrix | ( | const Matrix & | matrix_in | ) | [inline] |
Initializes from a matrix copying all contents, assuming all features are continuous.
matrix_in | data where rows are features, columns are points |
Definition at line 597 of file dataset.h.
References InitBlank(), and DatasetInfo::InitContinuous().
void Dataset::Format | ( | index_t | feature, | |
index_t | point, | |||
String * | result | |||
) | const [inline] |
Formats as text a particular location of the data set.
feature | the feature number | |
point | the point index | |
result | string that will be initialized to the formatted text |
Definition at line 538 of file dataset.h.
References DatasetInfo::feature(), and DatasetFeature::Format().
void Dataset::Format | ( | index_t | feature, | |
index_t | point, | |||
String * | result | |||
) | const [inline] |
Formats as text a particular location of the data set.
feature | the feature number | |
point | the point index | |
result | string that will be initialized to the formatted text |
Definition at line 538 of file dataset.h.
References DatasetInfo::feature(), and DatasetFeature::Format().
double Dataset::get | ( | index_t | feature, | |
index_t | point | |||
) | const [inline] |
double Dataset::get | ( | index_t | feature, | |
index_t | point | |||
) | const [inline] |
int Dataset::get_int | ( | index_t | feature, | |
index_t | point | |||
) | const [inline] |
int Dataset::get_int | ( | index_t | feature, | |
index_t | point | |||
) | const [inline] |
void Dataset::GetLabels | ( | ArrayList< double > & | labels_list, | |
ArrayList< index_t > & | labels_index, | |||
ArrayList< index_t > & | labels_ct, | |||
ArrayList< index_t > & | labels_startpos | |||
) | const |
Gets a list and indices of labels in a labeled dataset.
The list corresponds to the different items of the n_features-th row (last row) in the matrix. The indices are arranged by grouping labels of the same class, i.e. (class_1 class_2...class_k); each item indicates the position of the label in the dataset.
labels_list | a list of labels in the dataset. e.g. [0.0,1.0,2.0] for a 3-class dataset | |
labels_index | the label indices of each data point. e.g. [(c1)[0,5,6,7,10,13,17], (c2)[1,2,4,8,9], (c3)[3,11,12,14,15,16,18,19]] | |
labels_ct | number of points in each label class. e.g. [7,5,8] | |
labels_startpos | start positions of each label class in labels_index. e.g. [0,7,12] |
void Dataset::GetLabels | ( | ArrayList< double > & | labels_list, | |
ArrayList< index_t > & | labels_index, | |||
ArrayList< index_t > & | labels_ct, | |||
ArrayList< index_t > & | labels_startpos | |||
) | const |
Gets a list and indices of labels in a labeled dataset.
The list corresponds to the different items of the n_features-th row (last row) in the matrix. The indices are arranged by grouping labels of the same class, i.e. (class_1 class_2...class_k); each item indicates the position of the label in the dataset.
labels_list | a list of labels in the dataset. e.g. [0.0,1.0,2.0] for a 3-class dataset | |
labels_index | the label indices of each data point. e.g. [(c1)[0,5,6,7,10,13,17], (c2)[1,2,4,8,9], (c3)[3,11,12,14,15,16,18,19]] | |
labels_ct | number of points in each label class. e.g. [7,5,8] | |
labels_startpos | start positions of each label class in labels_index. e.g. [0,7,12] |
Definition at line 327 of file dataset.cc.
References ArrayList< TElem >::Clear(), ArrayList< TElem >::Init(), n_labels(), n_points(), and ArrayList< TElem >::PushBack().
Referenced by GeneralCrossValidator< TLearner >::Run().
DatasetInfo& Dataset::info | ( | ) | [inline] |
const DatasetInfo& Dataset::info | ( | ) | const [inline] |
Metadata about the feature types and names for the dataset.
Each dataset has a name, and each feature is described by a type (continuous, etc.) and a name, modelled loosely on the ARFF format. See the DatasetInfo documentation for more information.
DatasetInfo& Dataset::info | ( | ) | [inline] |
const DatasetInfo& Dataset::info | ( | ) | const [inline] |
Metadata about the feature types and names for the dataset.
Each dataset has a name, and each feature is described by a type (continuous, etc.) and a name, modelled loosely on the ARFF format. See the DatasetInfo documentation for more information.
Definition at line 400 of file dataset.h.
Referenced by SimpleCrossValidator< TClassifier >::Init(), and SplitTrainTest().
void Dataset::InitBlank | ( | ) | [inline] |
void Dataset::InitBlank | ( | ) | [inline] |
Initializer that omits the matrix and info - you must initialize these yourself.
(Although this currently does nothing, this is to future-proof your code against possible changes.)
Definition at line 549 of file dataset.h.
Referenced by AliasMatrix(), CopyMatrix(), OwnMatrix(), and SplitTrainTest().
success_t Dataset::InitFromFile | ( | TextLineReader * | reader, | |
const char * | filename = "dataset" | |||
) |
Reads in an ARFF or CSV/WSV file.
ARFF LIMITATIONS: Values cannot have spaces or commas, even with quotes; 'string' data type not supported (nominal is supported).
reader | a line reader opened on a CSV or WSV or ARFF file | |
filename | a title given to this data set; it does not need to be anything significant |
success_t Dataset::InitFromFile | ( | const char * | fname | ) |
Reads in an ARFF or CSV/WSV file.
ARFF LIMITATIONS: Values cannot have spaces or commas, even with quotes; 'string' data type not supported (nominal is supported).
fname | the name of an ARFF, CSV, or whitespace-separated file |
success_t Dataset::InitFromFile | ( | TextLineReader * | reader, | |
const char * | filename = "dataset" | |||
) |
Reads in an ARFF or CSV/WSV file.
ARFF LIMITATIONS: Values cannot have spaces or commas, even with quotes; 'string' data type not supported (nominal is supported).
reader | a line reader opened on a CSV or WSV or ARFF file | |
filename | a title given to this data set; it does not need to be anything significant |
Definition at line 589 of file dataset.cc.
References DatasetInfo::InitFromFile(), and DatasetInfo::ReadMatrix().
success_t Dataset::InitFromFile | ( | const char * | fname | ) |
Reads in an ARFF or CSV/WSV file.
ARFF LIMITATIONS: Values cannot have spaces or commas, even with quotes; 'string' data type not supported (nominal is supported).
fname | the name of an ARFF, CSV, or whitespace-separated file |
Definition at line 576 of file dataset.cc.
References DatasetInfo::Init(), and TextLineReader::Open().
Referenced by data::Load(), and LoadData().
Matrix& Dataset::matrix | ( | ) | [inline] |
const Matrix& Dataset::matrix | ( | ) | const [inline] |
Matrix& Dataset::matrix | ( | ) | [inline] |
const Matrix& Dataset::matrix | ( | ) | const [inline] |
Returns the matrix that stores all the data.
Definition at line 520 of file dataset.h.
Referenced by SVM< TKernel >::BatchPredict(), DoSvmNormalize(), data::Load(), GeneralCrossValidator< TLearner >::Run(), SimpleCrossValidator< TClassifier >::Run(), and SplitTrainTest().
index_t Dataset::n_features | ( | ) | const [inline] |
index_t Dataset::n_features | ( | ) | const [inline] |
Gets the number of features/attributes the dataset has.
This corresponds to the number of rows in the matrix.
Definition at line 419 of file dataset.h.
Referenced by SVM< TKernel >::BatchPredict(), DoSvmNormalize(), SVM< TKernel >::Init(), SimpleCrossValidator< TClassifier >::Init(), GeneralCrossValidator< TLearner >::Run(), SimpleCrossValidator< TClassifier >::Run(), and SplitTrainTest().
index_t Dataset::n_labels | ( | ) | const |
Gets the number of labels in a labeled dataset.
This corresponds to the number of different items of the n_features-th row (last row) in the matrix.
index_t Dataset::n_labels | ( | ) | const |
Gets the number of labels in a labeled dataset.
This corresponds to the number of different items of the n_features-th row (last row) in the matrix.
Definition at line 298 of file dataset.cc.
References ArrayList< TElem >::Clear(), ArrayList< TElem >::Init(), and ArrayList< TElem >::PushBack().
Referenced by GetLabels(), SVM< TKernel >::Init(), GeneralCrossValidator< TLearner >::Init(), and GeneralCrossValidator< TLearner >::Run().
index_t Dataset::n_points | ( | ) | const [inline] |
index_t Dataset::n_points | ( | ) | const [inline] |
Gets the number of points/instances in the dataset.
This corresponds to the number of columns in the matrix.
Definition at line 430 of file dataset.h.
Referenced by SVM< TKernel >::BatchPredict(), GeneralCrossValidator< TLearner >::clsf_n_incorrect(), GeneralCrossValidator< TLearner >::clsf_portion_correct(), DoSvmNormalize(), GetLabels(), SVM< TKernel >::Init(), SimpleCrossValidator< TClassifier >::n_incorrect(), SimpleCrossValidator< TClassifier >::portion_correct(), GeneralCrossValidator< TLearner >::Run(), SimpleCrossValidator< TClassifier >::Run(), and SplitTrainTest().
void Dataset::OwnMatrix | ( | Matrix * | matrix_in | ) | [inline] |
Initializes by becoming the owner of an existing matrix, assuming all features are continuous.
Becoming the owner of the matrix means that the matrix will be freed when the Dataset falls out of scope. See the Matrix class for details about the Own function.
matrix_in | data where rows are features, columns are points |
Definition at line 613 of file dataset.h.
References InitBlank(), and DatasetInfo::InitContinuous().
void Dataset::OwnMatrix | ( | Matrix * | matrix_in | ) | [inline] |
Initializes by becoming the owner of an existing matrix, assuming all features are continuous.
Becoming the owner of the matrix means that the matrix will be freed when the Dataset falls out of scope. See the Matrix class for details about the Own function.
matrix_in | data where rows are features, columns are points |
Definition at line 613 of file dataset.h.
References InitBlank(), and DatasetInfo::InitContinuous().
Referenced by GenerateArtificialDataset().
double* Dataset::point | ( | index_t | point | ) | [inline] |
const double* Dataset::point | ( | index_t | point | ) | const [inline] |
double* Dataset::point | ( | index_t | point | ) | [inline] |
const double* Dataset::point | ( | index_t | point | ) | const [inline] |
void Dataset::set | ( | index_t | feature, | |
index_t | point, | |||
double | d | |||
) | [inline] |
void Dataset::set | ( | index_t | feature, | |
index_t | point, | |||
double | d | |||
) | [inline] |
void Dataset::SplitTrainTest | ( | int | folds, | |
int | fold_number, | |||
const ArrayList< index_t > & | permutation, | |||
Dataset * | train, | |||
Dataset * | test | |||
) | const |
Creates a training and test dataset for k-fold cross validation.
The test set will contain approximately n_points() / folds points, and the training set will contain all remaining points. This takes a permutation as an argument to allow use of consistent random permutations. If an identity permutation is used, the split will be strided.
folds | the number of folds being used | |
fold_number | the fold number, 0 to folds - 1 | |
permutation | the permutation to use, the same size as n_points() (use math::MakeIdentityPermutation or math::MakeRandomPermutation) | |
train | the training set | |
test | the test set |
void Dataset::SplitTrainTest | ( | int | folds, | |
int | fold_number, | |||
const ArrayList< index_t > & | permutation, | |||
Dataset * | train, | |||
Dataset * | test | |||
) | const |
Creates a training and test dataset for k-fold cross validation.
The test set will contain approximately n_points() / folds points, and the training set will contain all remaining points. This takes a permutation as an argument to allow use of consistent random permutations. If an identity permutation is used, the split will be strided.
folds | the number of folds being used | |
fold_number | the fold number, 0 to folds - 1 | |
permutation | the permutation to use, the same size as n_points() (use math::MakeIdentityPermutation or math::MakeRandomPermutation) | |
train | the training set | |
test | the test set |
Definition at line 632 of file dataset.cc.
References mem::Copy(), info(), InitBlank(), matrix(), n_features(), and n_points().
Referenced by GeneralCrossValidator< TLearner >::Run(), and SimpleCrossValidator< TClassifier >::Run().
success_t Dataset::WriteArff | ( | const char * | fname | ) | const |
Writes to an ARFF file.
fname | name of the file |
success_t Dataset::WriteArff | ( | const char * | fname | ) | const |
Writes to an ARFF file.
fname | name of the file |
Definition at line 619 of file dataset.cc.
References TextWriter::Close(), TextWriter::Open(), DatasetInfo::WriteArffHeader(), and DatasetInfo::WriteMatrix().
success_t Dataset::WriteCsv | ( | const char * | fname, | |
bool | header = false | |||
) | const |
Writes to a CSV file.
fname | name of the file | |
header | whether to include a first line which is the titles of the data |
success_t Dataset::WriteCsv | ( | const char * | fname, | |
bool | header = false | |||
) | const |
Writes to a CSV file.
fname | name of the file | |
header | whether to include a first line which is the titles of the data |
Definition at line 604 of file dataset.cc.
References TextWriter::Close(), TextWriter::Open(), DatasetInfo::WriteCsvHeader(), and DatasetInfo::WriteMatrix().
Referenced by DoSvmNormalize(), and data::Save().