BALL
1.4.1
|
00001 /* QSARData.h 00002 * 00003 * Copyright (C) 2009 Marcel Schumann 00004 * 00005 * This file is part of QuEasy -- A Toolbox for Automated QSAR Model 00006 * Construction and Validation. 00007 * QuEasy is free software; you can redistribute it and/or modify 00008 * it under the terms of the GNU General Public License as published by 00009 * the Free Software Foundation; either version 3 of the License, or (at 00010 * your option) any later version. 00011 * 00012 * QuEasy is distributed in the hope that it will be useful, but 00013 * WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 * General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU General Public License 00018 * along with this program; if not, see <http://www.gnu.org/licenses/>. 00019 */ 00020 00021 // -*- Mode: C++; tab-width: 2; -*- 00022 // vi: set ts=2: 00023 // 00024 // 00025 00026 #ifndef QSARH 00027 #define QSARH 00028 00029 #include <iostream> 00030 #include <BALL/KERNEL/system.h> 00031 #include <BALL/FORMAT/SDFile.h> 00032 #include <BALL/FORMAT/PDBFile.h> 00033 #include <BALL/FORMAT/HINFile.h> 00034 #include <BALL/FORMAT/MOLFile.h> 00035 #include <vector> 00036 #include <list> 00037 #include <set> 00038 #include <map> 00039 #include <math.h> 00040 #include <sstream> 00041 #include <fstream> 00042 #include <limits> 00043 #include <fstream> 00044 #include <BALL/QSAR/simpleDescriptors.h> 00045 #include <BALL/QSAR/connectivityDescriptors.h> 00046 #include <BALL/QSAR/partialChargeDescriptors.h> 00047 #include <BALL/QSAR/surfaceDescriptors.h> 00048 #include <BALL/COMMON/exception.h> 00049 #include <string.h> 00050 00051 #ifndef STATISTICS 00052 #include <BALL/QSAR/statistics.h> 00053 #endif 00054 00055 #ifndef QSAR_EXCEPTION 00056 #include <BALL/QSAR/exception.h> 00057 #endif 00058 00059 #include <gsl/gsl_randist.h> 00060 #include <gsl/gsl_cdf.h> 00061 00062 #include <BALL/CONCEPT/timeStamp.h> 00063 00064 // #ifndef MODEL 00065 // #include "Model.h" 00066 // #endif 00067 00068 namespace BALL 00069 { 00070 namespace QSAR 00071 { 00072 typedef vector<double> Column; 00073 typedef vector<Column> VMatrix; 00074 00076 class BALL_EXPORT QSARData 00077 { 00078 public: 00079 00080 QSARData(); 00081 00082 ~QSARData(); 00083 00087 bool isDataCentered() const; 00088 00090 bool isResponseCentered() const; 00091 00096 vector<String>* readPropertyNames(String sd_file); 00097 00101 void readSDFile(const char* file); 00102 00108 void readSDFile(const char* file, std::multiset<int>& act, bool useExDesc=1, bool append=0, bool translate_class_labels=0); 00109 00113 void calculateBALLDescriptors(Molecule& m); 00114 00116 void displayMatrix(); 00117 00120 void centerData(bool center_Y=0); 00121 00123 void scaleAllDescriptors(); 00124 00126 unsigned int getNoSubstances() const; 00127 00129 unsigned int getNoDescriptors() const; 00130 00138 void readCSVFile(const char* file, int no_y, bool xlabels, bool ylabels, const char* sep=",", bool appendDescriptors=0, bool translate_class_labels=0); 00139 00141 void manipulateY(vector<String> v); 00142 00145 void manipulateY(String v); 00146 00149 void discretizeY(vector<double> thresholds); 00150 00151 void transformX(vector<String> v); 00152 00154 vector<QSARData*> partitionInputData(int p); 00155 00157 void saveToFile(string filename) const; 00158 00160 void readFromFile(string filename); 00161 00164 vector<QSARData*> generateExternalSet(double fraction) const; 00165 00170 vector<QSARData*> evenSplit(int no_test_splits, int current_test_split_id, int response_id=0) const; 00171 00173 vector<double>* getSubstance(int s) const; 00174 00176 vector<double>* getActivity(int s) const; 00177 00179 unsigned int getNoResponseVariables() const; 00180 00181 const vector<string>* getSubstanceNames() const; 00182 00184 bool checkforDiscreteY() const; 00185 00186 00188 bool checkforDiscreteY(const char* file, std::multiset<int>& activity_IDs) const; 00189 00191 void setDataFolder(const char* folder); 00192 00195 void removeHighlyCorrelatedCompounds(double& compound_cor_threshold, double& feature_cor_threshold); 00196 00202 void getSimilarDescriptors(int descriptor_ID, double correlation, std::list<std::pair<uint,String> >& similar_descriptor_IDs) const; 00204 00205 00206 protected: 00207 00212 void setDescriptorNames(const Molecule& m, std::multiset<int>& activity_IDs, bool useExDesc=1); 00213 00216 void removeInvalidDescriptors(std::multiset<int>& invalidDescriptors); 00217 00218 void removeInvalidSubstances(std::multiset<int>& inv); 00219 00221 void readMatrix(VMatrix& mat, std::ifstream& in, char seperator, unsigned int lines, unsigned int col); 00222 00225 void checkActivityIDs(std::multiset<int>& act, int no_properties); 00226 00229 void insertSubstance(const QSARData* source, int s, bool backtransformation=0); 00230 00232 void printMatrix(const VMatrix& mat, std::ostream& out) const; 00234 00239 VMatrix descriptor_matrix_; 00240 00242 VMatrix Y_; 00243 00245 VMatrix descriptor_transformations_; 00246 00248 VMatrix y_transformations_; 00249 00251 vector<string> column_names_; 00252 00254 vector<string> substance_names_; 00255 00257 std::multiset<int> invalidDescriptors_; 00258 00259 std::multiset<int> invalidSubstances_; 00260 00261 String data_folder_; 00262 00264 std::map<String,int> class_names_; 00266 00267 00268 00269 friend class ClassificationValidation; 00270 friend class RegressionValidation; 00271 friend class Validation; 00272 friend class Model; 00273 friend class FitModel; 00274 friend class FeatureSelection; 00275 00276 }; 00277 00278 } 00279 } 00280 00281 #endif // QSARH