Main Page   Modules   Namespace List   Class Hierarchy   Data Structures   File List   Data Fields   Globals  

Mafia.good.cpp

Go to the documentation of this file.
00001 /*
00002 
00003 Copyright (c) 2003, Cornell University
00004 All rights reserved.
00005 
00006 Redistribution and use in source and binary forms, with or without
00007 modification, are permitted provided that the following conditions are met:
00008 
00009   - Redistributions of source code must retain the above copyright notice,
00010       this list of conditions and the following disclaimer.
00011   - Redistributions in binary form must reproduce the above copyright
00012       notice, this list of conditions and the following disclaimer in the
00013       documentation and/or other materials provided with the distribution.
00014   - Neither the name of Cornell University nor the names of its
00015       contributors may be used to endorse or promote products derived from
00016       this software without specific prior written permission.
00017 
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00019 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00022 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00023 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00024 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00025 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00026 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00027 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
00028 THE POSSIBILITY OF SUCH DAMAGE.
00029 
00030 */
00031 
00032 /////////////////////////////////////////////////////////////////////
00033 //
00034 // Mafia.cpp
00035 //
00036 /////////////////////////////////////////////////////////////////////
00037 
00038 #define DEBUG
00039 
00040 #include <iostream>
00041 #include <fstream>
00042 #include <time.h>
00043 #include <stdio.h>
00044 #include <list>
00045 #include <vector>
00046 #include <algorithm>
00047 #include <string>
00048 #include <assert.h>
00049 #include <math.h>
00050 #include <map>
00051 #include "Bitmap.h"
00052 #include "TreeNode.h"
00053 #include "Transaction.h"
00054 #include "ItemsetOutput.h"
00055 
00056 using namespace std;
00057 
00058 /// @defgroup GlobalVariables Global Variables
00059 /// Global variables
00060 /** @{ */
00061 typedef vector<TreeNode *> NodeList;
00062 typedef vector<TreeNode *> BranchList;
00063 typedef vector<Bitmap *> BitmapList;
00064 typedef vector<BaseBitmap *> BaseBitmapList;
00065 typedef vector<int> ItemSet;
00066 typedef map<long, ItemSet *> HashTable;
00067 
00068 /// Simple class for storing subtree size estimates
00069 class SubtreeEstimate {
00070 public:
00071     int Count;              ///< Count of actual subtrees counted
00072     int Sum;                ///< Sum of subtree sizes
00073     SubtreeEstimate () {
00074         Count = 0;
00075         Sum = 0;
00076     }
00077 };
00078 
00079 /// Simple class for storing tail elements of each node of the tree
00080 class TailElement {
00081 public:
00082     int Count;              ///< Support of the 1-extension
00083     int Item;               ///< Item-id for this1 -extension
00084     
00085     TailElement () {
00086         Count = 0;
00087         Item = 0;
00088     }
00089     
00090     bool operator < (const TailElement& rhs) const {
00091         return this->Count < rhs.Count;
00092     };
00093 };
00094 
00095 /// @defgroup CommandLineParameters Command Line Parameters/Variables
00096 /// Commmand line parameters from user or inferred
00097 /** @{ */
00098 string method;               ///< either -mfi or -fci or -fi
00099 char* outFilename;           ///< filename for output
00100 ItemsetOutput *outFile;      ///< file for ouput
00101 bool outputMFI = false;      ///< true if MFI should be saved to a file
00102 bool MethodIsFI = false;     ///< true if the method is -fi
00103 bool MethodIsFCI = false;    ///< true if the method is -fci
00104 int ItemCount;               ///< # of items in the file
00105 int TransCount;              ///< # of transactions in the file
00106 double MSF;                  ///< user-defined min sup as percentage
00107 int MS;                      ///< min sup as a transaction count
00108 int VerScale = 1;            ///< Scaling factor for transactions
00109 int HorScale = 1;            ///< Scaling factor for items
00110 bool GoFHUT = true;          ///< FHUT flag
00111 bool HUTMFI = true;          ///< HUTMFI flag
00112 bool PEPrune = true;         ///< PEPrune flag -- parent equivalent pruning
00113 bool Reorder = true;         ///< Reorder flag
00114 /** @} */
00115 
00116 
00117 /// @defgroup CounterVariables Counter Variables
00118 /// Counter variables for information gathering
00119 /** @{ */
00120 int CountFHUT = 0;           ///< # of times FHUT was successful
00121 int CountNodes = 0;          ///< # of frequent nodes in the tree
00122 int CountCounts = 0;         ///< # of Counts or all nodes in the tree
00123 int CountAnds = 0;           ///< # of ANDs of normal bitmaps
00124 int CountSmallAnds = 0;      ///< # of compressed bitmap ANDs
00125 int CountPEPrunes = 0;         ///< # of PEPruning
00126 int CountCheckPosition = 0;  ///< # of CheckPosition calls
00127 int CountHUTMFI = 0;         ///< # of HUTMFI attempts
00128 int CountHUTMFISuccess = 0;  ///< # of HUTMFI successes
00129 int CountRebuilds;           ///< # of Rebuilds
00130 /** @} */
00131 
00132 
00133 /// @defgroup ProgramVariables Program Parameters/Variables
00134 /// Useful program parameters/counters
00135 /** @{ */
00136 int maxtail = 0;
00137 int MFISize = 0;             ///< MFI size before pruning
00138 int MFIDepth = 0;            ///< The aggregated depth of the all MFI elements
00139 int F1size = 0;              ///< # of frequent 1-itemsets after merging repeats
00140 int FullF1size = 0;          ///< # of frequent 1-itemsets
00141 int k = 50;                  ///< # of items checked for a MFI lookup
00142 int MAX_compID = 1;          ///< max compression ID
00143 int projectDepth = -1;       ///< depth of the bitmap you're projecting from
00144 int EstimateSize;            ///< size of subtree estimation buffer
00145 int EstimateDiv = 5;         ///< bucket size by frequent tail length
00146 int maxItemsetSize = 0;      ///< max size of a frequent itemset
00147 /** @} */
00148 
00149 
00150 /// @defgroup DataVariables Data variables
00151 /// Complex data structure variables
00152 /** @{ */
00153 NodeList F1;                 ///< List of frequent 1-itemsets
00154 BitmapList TransBuffy;       ///< Buffer of transaction bitmaps
00155 BaseBitmapList NameBuffy;    ///< Buffer of name bitmaps
00156 NodeList NodeBuffy;          ///< Buffer of tree nodess
00157 TreeNode *Root;              ///< The root (the nullset)
00158 TailElement *gTail;          ///< global tail pointer
00159 TailElement *TailBuffy;      ///< Common Buffer for tail elements
00160 Bitmap *NullTrans;           ///< A transaction bitmap filled with ones
00161 int *ItemMap;                ///< For renaming items after sorting by support
00162 int *ReverseItemMap;         ///< For remembering the renaming...
00163 BaseBitmapList MFI;          ///< List of Maximally Frequent Itemsets
00164 HashTable HT;                ///< Hash table of transaction supports
00165 vector <int> SupportCountList;       ///< List that stores support count
00166 BaseBitmap* TempName;        ///< Temporary buffer for one name bitmap
00167 SubtreeEstimate* EstimateBuffy;      ///< Buffer of subtree estimates
00168 int *MFIBySizes;             ///< Buffer for counting MFI by itemset size
00169 int *ItemsetBuffy;           ///< Buffer for writing itemsets to file output
00170 /** @} */
00171 
00172 
00173 /// @defgroup TimingVariables Timing Variables
00174 /// Variables for timing (and instrumenting the code)
00175 /** @{ */
00176 time_t total_start, total_finish;
00177 double total_time;
00178 time_t read_start, read_finish;
00179 double read_time;
00180 time_t algorithm_start, algorithm_finish;
00181 double algorithm_time;
00182 time_t print_start, print_finish;
00183 double print_time;
00184 /** @} */
00185 /** @} */
00186 
00187 /*********************************************************************
00188  Reading the data in and building the item bitmaps
00189 *********************************************************************/ 
00190 /// @defgroup InputOutput Input/Output Functions
00191 /// Reading the data in and building the item bitmaps.
00192 /// Outputting the frequent itemsets.
00193 /** @{ */
00194 
00195 /////////////////////////////////////////////////////////////////////
00196 /// Insert pointer to node in order of increasing support
00197 ///
00198 /// @param newNode           item node to add to F1
00199 /////////////////////////////////////////////////////////////////////
00200 void AddToF1(TreeNode *newNode) {
00201     if (F1.empty())
00202         F1.push_back(newNode);
00203     else {
00204         // use insertion sort for increasing support ordering
00205         NodeList::iterator noli = F1.begin();
00206         while (noli != F1.end()) {
00207             if ((*noli)->Trans->_count >= newNode->Trans->_count) {
00208                 F1.insert(noli, newNode);
00209                 return ;
00210             }
00211             noli++;
00212         }
00213 
00214         // Add to end of F1 list
00215         F1.push_back(newNode);
00216     }
00217 }
00218 
00219 /////////////////////////////////////////////////////////////////////
00220 /// Create bitmaps filled randomly with probability p
00221 ///
00222 /// @param P - probability of a bit p being set
00223 /////////////////////////////////////////////////////////////////////
00224 void F1UsingProb(double P) {
00225     int blah = 0;
00226     //cout << "Creating bitmap with prob p=" << P << endl;
00227 
00228     // Create frequent 1-itemsets with probability p
00229     for (int i = 0; i < ItemCount / HorScale; i++) {
00230         // Create random transaction bitmap
00231         Bitmap *trans = new Bitmap(TransCount);
00232         trans->FillRand(P);
00233 
00234         // Add frequent items to list
00235         if (trans->Count(blah) >= MS) {
00236             TreeNode *node = new TreeNode(NULL, trans, 1, -1, i, -1, 0);
00237             for (int r = 0; r < HorScale; r++)
00238                 AddToF1(node);
00239         } else
00240             delete trans;
00241     }
00242 }
00243 
00244 /////////////////////////////////////////////////////////////////////
00245 /// Read transaction data from file and build item bitmaps.
00246 ///     If the data is in ascii form:
00247 ///     [itemid_1] [itemid_2] ... [itemid_n]
00248 ///     If the data is in binary form:
00249 ///     [custid] [transid] [number of items] [itemid_1] [itemid_2] ... [itemid_n]
00250 ///
00251 /// @param filename          name of file to be read in
00252 /// @param isAsciiFile       true for ASCII input, false for binary
00253 /////////////////////////////////////////////////////////////////////
00254 void F1FromFile(char *filename, bool isAsciiFile) {
00255     TransCount = 0;
00256     int MAXitemID = -1;
00257     int *Counters = new int[MAX_NUM_ITEMS];        //to count the support of items
00258     int *F1IndexList = new int[MAX_NUM_ITEMS];     // the id's of F1 items
00259     int *InvF1IndexList = new int[MAX_NUM_ITEMS];
00260     int itemIndex = 0;
00261 
00262     //Initialize counters
00263     for (int ct = 0; ct < MAX_NUM_ITEMS; ++ct) {
00264         Counters[ct] = 0;
00265         F1IndexList[ct] = -1;
00266         InvF1IndexList[ct] = -1;
00267     }
00268 
00269     time(&read_start);
00270     BitmapList Trans;
00271 
00272     int *itemlist = new int[MAX_NUM_ITEMS];
00273     InputData *inData = new InputData(filename, itemlist, isAsciiFile);
00274     if (!inData->isOpen()) {
00275         cerr << "Input file not found!" << endl;
00276         exit(1);
00277     }
00278 
00279     Transaction *newTransaction = inData->getNextTransaction();
00280     while (newTransaction != NULL) {
00281         for (itemIndex = 0; itemIndex < newTransaction->length; itemIndex++) {
00282             // ensure that there are not too many items
00283             if (newTransaction->itemlist[itemIndex] >= MAX_NUM_ITEMS) {
00284                 cerr << "Read item_id=" << newTransaction->itemlist[itemIndex]
00285                 << " which is more than the max item_id allowed (" << MAX_NUM_ITEMS << ")";
00286                 exit(1);
00287             }
00288 
00289             if (newTransaction->itemlist[itemIndex] > MAXitemID)
00290                 MAXitemID = newTransaction->itemlist[itemIndex];
00291 
00292             Counters[newTransaction->itemlist[itemIndex]]++;
00293         }
00294 
00295         TransCount++;
00296         delete newTransaction;
00297         newTransaction = inData->getNextTransaction();
00298     }
00299 
00300     delete newTransaction;
00301     delete inData;
00302 
00303     MS = (int)ceil(MSF * (double)TransCount);
00304     //MSF = MS/(double)TransCount;
00305 
00306     ItemCount = MAXitemID + 1;
00307 
00308     int F1items = 0;
00309     // build the normal bitmaps -- Preallocated memory for the bitmaps
00310     for (itemIndex = 0; itemIndex <= MAXitemID; itemIndex++) {
00311         if (Counters[itemIndex] >= MS) {
00312             F1IndexList[F1items++] = itemIndex;
00313             InvF1IndexList[itemIndex] = F1items - 1;
00314             Bitmap *trans = new Bitmap(TransCount);
00315             trans->_count = Counters[itemIndex];
00316             Trans.push_back(trans);
00317         }
00318     }
00319 
00320     int transIndex = 0;
00321     InputData *inData2 = new InputData(filename, itemlist, isAsciiFile);
00322     newTransaction = inData2->getNextTransaction();
00323     while (newTransaction != NULL) {
00324         for (int itemIndex = 0; itemIndex < newTransaction->length; itemIndex++) {
00325             if (InvF1IndexList[newTransaction->itemlist[itemIndex]] != -1) {
00326                 Trans[InvF1IndexList[newTransaction->itemlist[itemIndex]]]->FillEmptyPosition(transIndex);
00327             }
00328         }
00329 
00330         transIndex++;
00331         delete newTransaction;
00332         newTransaction = inData2->getNextTransaction();
00333     }
00334 
00335     delete newTransaction;
00336     delete inData2;
00337     delete [] itemlist;
00338 
00339     time(&read_finish);
00340     read_time = difftime(read_finish, read_start);
00341     printf("Reading input time: %.2f seconds.\n", read_time);
00342 
00343     // Create F1
00344     BitmapList::iterator bli = Trans.begin();
00345     itemIndex = 0;
00346 
00347     while (bli != Trans.end()) {
00348         TreeNode *node = new TreeNode(
00349                              NULL,
00350                              (*bli),
00351                              1,
00352                              -1,
00353                              F1IndexList[itemIndex],
00354                              -1,
00355                              0);
00356         AddToF1(node);
00357         bli++;
00358         itemIndex++;
00359     }
00360 
00361     delete [] Counters;
00362     delete [] F1IndexList;
00363     delete [] InvF1IndexList;
00364 }
00365 
00366 /////////////////////////////////////////////////////////////////////
00367 /// To print the MFI data out to an ASCII file
00368 ///     with each entry having the format:
00369 ///     [list of items in MFI entry...] [(support)]
00370 /////////////////////////////////////////////////////////////////////
00371 void PrintMFI() {
00372     // open output file
00373     outFile = new ItemsetOutput(outFilename);
00374     if (!outFile->isOpen()) {
00375         cerr << "Output file not open!" << endl;
00376         exit(1);
00377     }
00378 
00379     if (FullF1size != 0) {
00380         // translate bitmap to list of INTs
00381         int* ITEMS = new int[FullF1size];
00382         for (int i = 0; i < MFISize; i++) {
00383             int j = 0;
00384             for (int cc = 0; cc < FullF1size; cc++) {
00385                 if (MFI[i]->CheckPosition(cc, CountCheckPosition) > 0) {
00386                     ITEMS[j] = ItemMap[cc];
00387                     j++;
00388                 }
00389             }
00390 
00391             outFile->printSet(MFI[i]->_count, ITEMS, SupportCountList[i]);
00392         }
00393         delete [] ITEMS;
00394     } else {
00395         outFile->printSet(0, NULL, TransCount);
00396     }
00397 
00398     delete outFile;
00399 }
00400 
00401 void cerrMFI() {
00402     if (FullF1size != 0) {
00403         // translate bitmap to list of INTs
00404         int* ITEMS = new int[FullF1size];
00405         for (int i = 0; i < MFISize; i++) {
00406             int j = 0;
00407             cerr << "MFI i=" << i << ":";
00408             for (int cc = 0; cc < FullF1size; cc++) {
00409                 if (MFI[i]->CheckPosition(cc, CountCheckPosition) > 0) {
00410                     cerr << " " << cc;
00411                     j++;
00412                 }
00413             }
00414 
00415             cerr << endl;
00416         }
00417         delete [] ITEMS;
00418     }
00419 }
00420 
00421 void PrintItemset(BaseBitmap *printSet) {
00422     if (FullF1size != 0) {
00423         for (int cc = 0; cc < FullF1size; cc++) {
00424             if (printSet->CheckPosition(cc, CountCheckPosition) > 0) {
00425                 cerr << cc << " ";
00426             }
00427         }
00428         cerr << endl;
00429     }
00430 }
00431 /** @} */
00432 
00433 
00434 /*********************************************************************
00435  Algorithmic components
00436 *********************************************************************/ 
00437 /// @defgroup AlgorithmicComponents Algorithmic Component Functions
00438 /// Algorithm components (HUTMFI, PEP, etc.)
00439 /** @{ */
00440 
00441 /////////////////////////////////////////////////////////////////////
00442 /// Check for an existing superset of name in the MFI
00443 ///
00444 /// @param location          The node we're at.  Use this to examine only
00445 ///                          the relevant portion of the MFI
00446 /// @return True - if superset found.
00447 ///         False - if no superset.
00448 /////////////////////////////////////////////////////////////////////
00449 bool LMFISuperSet(TreeNode* location) {
00450     return (location->rEnd > location->rBegin);
00451 }
00452 
00453 /////////////////////////////////////////////////////////////////////
00454 /// Output itemset (don't need to save bitmaps for FI)
00455 ///
00456 /// @param C                 the current node
00457 /////////////////////////////////////////////////////////////////////
00458 void AddToFI(TreeNode *C) {
00459     int itemsetIndex = 0;
00460     for (int cc = 0; cc < FullF1size; cc++) {
00461         if (C->Name->CheckPosition(cc, CountCheckPosition) > 0) {
00462             ItemsetBuffy[itemsetIndex] = ItemMap[cc];
00463             itemsetIndex++;
00464         }
00465     }
00466 
00467     MFIBySizes[C->Name->_count]++;
00468     if (C->Name->_count > maxItemsetSize)
00469         maxItemsetSize = C->Name->_count;
00470 
00471     if (outputMFI)
00472         outFile->printSet(C->Name->_count, ItemsetBuffy, C->Trans->_count);
00473 
00474     // Update stat variables
00475     MFIDepth += C->Name->_count;
00476     MFISize++;
00477 }
00478 
00479 /////////////////////////////////////////////////////////////////////
00480 /// Add this node's name bitmap to the MFI list
00481 ///
00482 /// @param C                 the current node
00483 /////////////////////////////////////////////////////////////////////
00484 void AddToMFI(TreeNode *C) {
00485     // copy name bitmap
00486     BaseBitmap *name = new BaseBitmap(*C->Name);
00487 
00488     // add to MFI
00489     MFI.push_back(name);
00490 
00491     // update the end of the relevant
00492     C->rEnd++;
00493 
00494     // Update stat variables
00495     MFIDepth += C->Name->_count;
00496     MFISize++;
00497 
00498     SupportCountList.push_back(C->Trans->_count);
00499     MFIBySizes[name->_count]++;
00500     if (name->_count > maxItemsetSize)
00501         maxItemsetSize = name->_count;
00502 }
00503 
00504 /////////////////////////////////////////////////////////////////////
00505 /// Add this node's name bitmap to the FCI list
00506 ///
00507 /// @param C                 the current node
00508 /////////////////////////////////////////////////////////////////////
00509 void AddToFCI(TreeNode *C) {
00510     // Search HT
00511     HashTable::iterator h = HT.find(C->Trans->_count);
00512 
00513     // If the support of node C is NOT in HashSup
00514     if (h == HT.end()) {
00515         // Add a new list to the HashSup
00516         ItemSet* newList = new ItemSet();
00517         newList->reserve(500);
00518         newList->push_back(MFI.size());
00519         HT.insert(HashTable::value_type(C->Trans->_count, newList));
00520 
00521         // Else add pointer to last item in iName to HT entry
00522     } else {
00523         for (ItemSet::reverse_iterator goli = (*h).second->rbegin();
00524                 goli != (*h).second->rend();
00525                 goli++)
00526             if (MFI[*goli]->Superset(C->Name))
00527                 return ;
00528 
00529         // search the table
00530         (*h).second->push_back(MFI.size());
00531     }
00532 
00533     // copy name bitmap
00534     BaseBitmap *name = new BaseBitmap(*C->Name);
00535 
00536     // add to MFI
00537     MFI.push_back(name);
00538 
00539     // Update stat variables
00540     MFIDepth += C->Name->_count;
00541     MFISize++;
00542 
00543     SupportCountList.push_back(C->Trans->_count);
00544     MFIBySizes[name->_count]++;
00545     if (name->_count > maxItemsetSize)
00546         maxItemsetSize = name->_count;
00547 }
00548 
00549 int SortLMFI(int rBegin, int rEnd, int sortBy) {
00550     int o = rBegin;
00551     int p = rEnd - 1;
00552     while ( o <= p) {
00553         while (o < rEnd && !MFI[o]->CheckPosition(sortBy, CountCheckPosition))
00554             o++;
00555         while (p >= rBegin && MFI[p]->CheckPosition(sortBy, CountCheckPosition))
00556             p--;
00557         if (o < p) {
00558             // we are now at a point where MFI[o] is relevant
00559             // and MFI[p] is not since o < p, we swap the two
00560             BaseBitmap* tempBitmap = MFI[o];
00561             MFI[o] = MFI[p];
00562             MFI[p] = tempBitmap;
00563             tempBitmap = NULL;
00564             o++;
00565             p--;
00566         }
00567     }
00568     // the first relevant one for the next node is o
00569     return o;
00570 }
00571 
00572 
00573 /////////////////////////////////////////////////////////////////////
00574 /// Determine whether a HUTMFI is true.
00575 ///     - if HUT is in MFI, then HUT is frequent
00576 ///     and the subtree rooted at this node can be pruned
00577 ///
00578 /// @return True if HUT is in the MFI
00579 /////////////////////////////////////////////////////////////////////
00580 bool CheckHUTMFI(TreeNode *C, int iTAIL) {
00581 
00582     // for each element i in the tail form {head} U {i} and check for a
00583     //     superset in the MFI
00584     int rBegin = C->rBegin;
00585     int rEnd = C->rEnd; 
00586     for (; iTAIL < C->tEnd; iTAIL++) {        
00587         rBegin = SortLMFI(rBegin, rEnd, F1[gTail[iTAIL].Item]->Prefix);        
00588            
00589         if (rEnd <= rBegin)
00590             return false;
00591     }
00592 
00593     return true;
00594 }
00595 
00596 
00597 /////////////////////////////////////////////////////////////////////
00598 /// Dynamically reorder the elements in the tail by increasing support
00599 ///    - Expand all children and sort by increasing support
00600 ///    - Remove infrequent children
00601 ///
00602 /// @param C                 current node
00603 /// @param iTAIL             index into tail of current node
00604 /// @param useComp           whether compressed bitmaps should be used
00605 /// @param NoChild           whether C has any frequent children
00606 /// @param AllFreq           whether all children are frequent
00607 /////////////////////////////////////////////////////////////////////
00608 void ReorderTail(
00609     TreeNode *C,
00610     int &iTAIL,
00611     bool useComp,
00612     bool &NoChild,
00613     bool &AllFreq) {
00614 
00615     int tailIndex = 0;
00616     int lol = 0;
00617     // for each tail element
00618     for (lol = iTAIL; lol < C->tEnd; lol++) {
00619         Bitmap *trans = TransBuffy[C->Depth];
00620         int theCount = 0;
00621 
00622         // Compress the bitmaps
00623         if (useComp && (F1[gTail[lol].Item]->compID != C->compID)) {
00624             F1[gTail[lol].Item]->Trans->BuildRelComp(
00625                 *TransBuffy[projectDepth]);
00626             F1[gTail[lol].Item]->compID = C->compID;
00627             CountRebuilds++;
00628         }
00629 
00630         // Use the compressed bitmaps
00631         if (useComp) {
00632             // AND the compressed bitmaps and count the result
00633             trans->AndCompOnly(
00634                 *C->Trans,
00635                 *F1[gTail[lol].Item]->Trans,
00636                 CountSmallAnds);
00637             theCount = trans->SmallCount(CountCounts);
00638 
00639             // use the full bitmaps
00640         } else {
00641             // AND & count the bitmaps
00642             trans->AndOnly(*C->Trans, *F1[gTail[lol].Item]->Trans, CountAnds);
00643             theCount = trans->Count(CountCounts);
00644         }
00645 
00646         // If the results is frequent
00647         if (theCount >= MS) {
00648             // if PEP pruning holds
00649             if (PEPrune && (trans->_count == C->Trans->_count)) {
00650                 // Move tail element from tail to head
00651                 C->Name->Or(*C->Name, *F1[gTail[lol].Item]->Name);
00652                 CountPEPrunes++;
00653 
00654                 // add tail element to reordered tail
00655             } else {
00656                 NoChild = false;
00657 
00658                 // create new tail element
00659                 TailBuffy[tailIndex].Count = theCount;
00660                 TailBuffy[tailIndex].Item = gTail[lol].Item;
00661                 tailIndex++;
00662             }
00663         } else
00664             AllFreq = false;
00665     }
00666 
00667     sort(TailBuffy, TailBuffy + tailIndex);
00668 
00669     // Set the begin and end values of the new tail
00670     iTAIL = C->tEnd;
00671     C->tEnd = iTAIL + tailIndex;
00672     C->tBegin = iTAIL;
00673     int r = 0;
00674 
00675     // Copy new tail into next slots
00676     for (lol = iTAIL; lol < C->tEnd; lol++) {
00677         gTail[lol] = TailBuffy[r];
00678         r++;
00679     }
00680 }
00681 
00682 /////////////////////////////////////////////////////////////////////
00683 /// Simply copy over the tail without expanding any of the children
00684 ///    for pure DFS (no expansion of all children)
00685 ///
00686 /// @param C                 current node
00687 /// @param iTAIL             index into tail of current node
00688 /////////////////////////////////////////////////////////////////////
00689 void NoorderTail(
00690     TreeNode *C,
00691     int &iTAIL) {
00692 
00693     // set begin and end tail pointers
00694     iTAIL = C->tEnd;
00695     C->tEnd = C->tEnd - C->tBegin + C->tEnd;
00696     C->tBegin = iTAIL;
00697     int r = 0;
00698 
00699     // copy over old tail to new tail
00700     for (int lol = iTAIL; lol < C->tEnd; lol++) {
00701         gTail[lol] = gTail[lol - C->tEnd + C->tBegin];
00702         r++;
00703     }
00704 }
00705 
00706 
00707 /////////////////////////////////////////////////////////////////////
00708 /// The main MAFIA algorithm function
00709 ///
00710 /// @param C                 the current node
00711 /// @param HUT               whether this is a HUT check (left most branch)
00712 /// @param FHUT              [output] whether the HUT is frequent
00713 /// @param useComp           if compression has been switched on
00714 /////////////////////////////////////////////////////////////////////
00715 void MAFIA(TreeNode *C, bool HUT, bool &FHUT, bool useComp) {
00716     CountNodes++;
00717     int iTAIL = C->tBegin;   // index into the tail
00718     bool NoChild = true;     // whether the node has any frequent children
00719     bool AllFreq = true;     // whether all the children are frequent
00720     FHUT = false;            // whether this is a FHUT
00721     int beforeCountNodes = CountNodes;
00722     int frequentTailSize = C->tEnd - iTAIL;
00723 
00724     if (iTAIL < C->tEnd) {
00725         /*// Check if a superset of the HUT is in the MFI
00726         if ((C == Root) && (HUTMFI)) {
00727             CountHUTMFI++;
00728             if (CheckHUTMFI(C, C->tBegin)) {
00729                 // stop generation of subtree from this node
00730                 CountHUTMFISuccess++;
00731                 return ;
00732             }
00733         }*/
00734 
00735         if (C != Root)
00736             if (Reorder)
00737                 ReorderTail(C, iTAIL, useComp, NoChild, AllFreq);
00738             else
00739                 NoorderTail(C, iTAIL);
00740 
00741         frequentTailSize = C->tEnd - iTAIL;
00742 
00743         int estimateTail = (int)(frequentTailSize / (double)EstimateDiv);
00744         if (estimateTail > 0 && C->Trans->_count != TransCount) {
00745             double estimateSubTree = EstimateBuffy[estimateTail].Sum / (double)EstimateBuffy[estimateTail].Count;
00746             double support = C->Trans->_count / (double) TransCount;
00747             double factor = 11.597 - 29.914 * (support - .52392) * (support - .52392);
00748             double cost = factor * frequentTailSize / (1 - support);
00749 
00750             // check if relative comp should be performed
00751             if ((!useComp) && (estimateSubTree > cost)) {
00752 
00753                 // build the relative bitmap for source [node bitmap] (all 1's)
00754                 C->Trans->BuildSource();
00755 
00756                 // remember the depth of the FULL bitmap your projecting from
00757                 projectDepth = C->Depth - 1;
00758 
00759                 // increment the ID
00760                 C->compID = MAX_compID;
00761                 MAX_compID++;
00762                 useComp = true;
00763             }
00764         }
00765 
00766         // Candidate generation - extend the Head with the tail elements
00767         // We start from the end of the tail and move backwards
00768         // Therefore the tail is iterated through in increasing support,
00769         // but is stored in decreasing support.
00770         while (iTAIL < C->tEnd) {
00771             // form a one-extension
00772             Bitmap *trans = TransBuffy[C->Depth];
00773             BaseBitmap *name = NameBuffy[C->Depth];
00774             TreeNode *node = NodeBuffy[C->Depth];
00775 
00776             // create name for the new node
00777             name->Or(*C->Name, *F1[gTail[iTAIL].Item]->Name);
00778 
00779             // compress the bitmaps if warranted
00780             if (useComp && (F1[gTail[iTAIL].Item]->compID != C->compID)) {
00781                 // build the relative for this node
00782                 F1[gTail[iTAIL].Item]->Trans->BuildRelComp(
00783                     *TransBuffy[projectDepth]);
00784                 F1[gTail[iTAIL].Item]->compID = C->compID;
00785                 CountRebuilds++;
00786             }
00787             
00788             int theCount = 0;
00789 
00790             // use the compressed bitmaps for ANDing and counting
00791             if (useComp) {
00792                 // AND and count small bitmaps
00793                 trans->AndCompOnly(
00794                     *C->Trans,
00795                     *F1[gTail[iTAIL].Item]->Trans,
00796                     CountSmallAnds);
00797 
00798                 if (Reorder)
00799                     trans->_count = gTail[iTAIL].Count;
00800                 else
00801                     theCount = trans->SmallCount(CountCounts);
00802             } else {
00803                 // AND and count the full bitmaps
00804                 trans->AndOnly(
00805                     *C->Trans,
00806                     *F1[gTail[iTAIL].Item]->Trans,
00807                     CountAnds);
00808 
00809                 if (Reorder)
00810                     trans->_count = gTail[iTAIL].Count;
00811                 else
00812                     theCount = trans->Count(CountCounts);
00813             }
00814             if (!Reorder && PEPrune && (theCount == C->Trans->_count)) {
00815                 CountPEPrunes++;
00816                 C->Name->Or(*C->Name, *F1[gTail[iTAIL].Item]->Name);
00817                 iTAIL++;
00818                 continue;
00819             }
00820 
00821             // Determine whether this candidate will be a HUT
00822             // Conceptually the leftmost branch of the tree is a HUT check
00823             if ((iTAIL != C->tBegin) && (C != Root))
00824                 HUT = 0;
00825             else
00826                 HUT = 1;
00827                 
00828             if (!AllFreq)
00829                 HUT = 0;
00830                 
00831             if (Reorder || (theCount >= MS)) {
00832                 // form the 1-extension node
00833                 node->setTreeNode(name,
00834                                   trans,
00835                                   C->Depth + 1,
00836                                   C->compID,
00837                                   F1[gTail[iTAIL].Item]->Prefix,
00838                                   iTAIL + 1,
00839                                   C->tEnd);
00840 
00841                 // setup the LMFI for the next level; it contains all
00842                 // itemsets in LMFI for this level that also include the
00843                 // one we're extending the node with.  We do sort of a
00844                 // quicksort thing to move the relevant itemsets for the
00845                 // next node to the end of the portion of the MFI relevant
00846                 // to this node
00847 
00848                 
00849                 node->rEnd = C->rEnd;                
00850                 node->rBegin = SortLMFI(C->rBegin, C->rEnd, node->Prefix);                
00851 
00852                 // Check for HUT in MFI for remaining tail
00853                 if (HUTMFI && node->tBegin != node->tEnd && !HUT) {
00854                     CountHUTMFI++;
00855 
00856                     if (CheckHUTMFI(node, node->tBegin)) {
00857                         // stop generation of extensions
00858                         CountHUTMFISuccess++;
00859                         AllFreq = false;
00860                         break;
00861                     }
00862                 }
00863 
00864                 NoChild = false;
00865                 // recurse down the tree
00866                 MAFIA(node, HUT, FHUT, useComp);
00867 
00868                 // Add those discovered from lower levels to the current LMFI
00869                 // LMFI_l = LMFI_l \union LMFI_{l+1}
00870                 // all we need to do is to update the end pointer
00871                 C->rEnd = node->rEnd;
00872             } else
00873                 AllFreq = false;
00874 
00875             // if this was a successful HUT check
00876             if (FHUT) {
00877                 // keep going up the tree
00878                 if (HUT) {
00879                     return ;
00880 
00881                     // reached start of HUT, so stop generation of subtree
00882                     // rooted at this node
00883                 } else {
00884                     FHUT = false;
00885                     break;
00886                 }
00887             }
00888 
00889             // Move on the next tail element
00890             iTAIL++;
00891         }
00892     }
00893 
00894     // if this is a FHUT
00895     if (HUT && AllFreq && GoFHUT) {
00896         FHUT = true;
00897         CountFHUT++;
00898     }
00899 
00900     // if this node is childless and not in MFI
00901     if (MethodIsFI)
00902         AddToFI(C);
00903     else if (MethodIsFCI && C != Root)
00904         AddToFCI(C);
00905     else if (NoChild && !LMFISuperSet(C)) {
00906         AddToMFI(C);
00907     }
00908 
00909     int subtreeSize = CountNodes - beforeCountNodes + 1;
00910     int estimateTail = (int)(frequentTailSize / (double)EstimateDiv);
00911     if (estimateTail > 0 && C->Trans->_count != TransCount) {
00912         EstimateBuffy[estimateTail].Count++;
00913         EstimateBuffy[estimateTail].Sum += subtreeSize;
00914     }
00915 }
00916 
00917 
00918 /////////////////////////////////////////////////////////////////////
00919 /// Merge repeated itemsets into one combined itemset
00920 ///    - e.g. if (transaction set of item 4) AND
00921 ///    (transaction set item 5) = (transaction set item 5),
00922 ///    then item 5 is a duplicate of item 4
00923 ///    due to increasing support
00924 /////////////////////////////////////////////////////////////////////
00925 void MergeRepeatedItemsets() {
00926     NodeList::iterator bali = F1.begin();
00927     Bitmap out(*(*bali)->Trans);
00928     int blah = 0;
00929 
00930     // for each frequent 1-itemset
00931     while (bali != F1.end()) {
00932         out.FillOnes();
00933         NodeList::iterator noli = bali;
00934         noli++;
00935 
00936         // search for a copy of the itemset's transaction set
00937         while (noli != F1.end()) {
00938             // stop when count is no longer the same
00939             if ((*bali)->Trans->_count != (*noli)->Trans->_count)
00940                 break;
00941             else {
00942                 // AND itemsets with the same count
00943                 out.AndOnly(*(*bali)->Trans, *(*noli)->Trans, CountAnds);
00944                 out.Count(blah);
00945 
00946                 // check for a duplicate
00947                 if (out._count == (*noli)->Trans->_count) {
00948                     (*bali)->Name->Or(*(*bali)->Name, *(*noli)->Name);
00949                     F1.erase(noli);
00950                 } else
00951                     noli++;
00952             }
00953         }
00954         bali++;
00955     }
00956 }
00957 /** @} */
00958 
00959 
00960 /*********************************************************************
00961  MAIN
00962 *********************************************************************/ 
00963 /// @defgroup MainFunction Main Function
00964 /// Main function for running the program
00965 /** @{ */
00966 
00967 /////////////////////////////////////////////////////////////////////
00968 // main function for the program
00969 /////////////////////////////////////////////////////////////////////
00970 int main(int argc, char **argv) {
00971     // Check parameters
00972 
00973     if (argc < 5) {
00974         cerr << "Usage: " << argv[0] << " [-mfi/-fci/-fi] [min sup (percent)] " << endl;
00975         cerr << "\t[-ascii/-binary] [input filename] " << endl;
00976         cerr << "\t[output filename (optional)]" << endl;
00977         cerr << "Ex: " << argv[0] << " -mfi .5 -ascii connect4.ascii mfi.txt" << endl;
00978         cerr << "Ex: " << argv[0] << " -mfi .3 -binary chess.binary" << endl;
00979         exit(0);
00980     }
00981 
00982     // time hook
00983     time(&total_start);
00984 
00985     // get the algorithm type
00986     method = argv[1];
00987 
00988     // Minimum support as a number
00989     MSF = atof(argv[2]);
00990     if (strcmp(argv[3], "-ascii") == 0)
00991         F1FromFile(argv[4], true);
00992     else if (strcmp(argv[3], "-binary") == 0)
00993         F1FromFile(argv[4], false);
00994     else {
00995         cerr << "File format must be -ascii or -binary" << endl;
00996         exit(1);
00997     }
00998 
00999     if (argc == 6) {
01000         outputMFI = true;
01001         outFilename = argv[5];
01002     }
01003 
01004     // Create a null node to begin the DFS tree
01005     NullTrans = new Bitmap(TransCount);
01006     NullTrans->FillOnes();
01007     NullTrans->_count = TransCount;
01008     ItemSet NullList;
01009 
01010     MFIBySizes = new int[MAX_ITEMSET_SIZE];
01011     for (int h = 0; h < MAX_ITEMSET_SIZE; h++) {
01012         MFIBySizes[h] = 0;
01013     }
01014 
01015     // Set size of F1
01016     FullF1size = F1size = F1.size();
01017 
01018     // if F1 is not empty
01019     if (FullF1size != 0) {
01020 
01021         BaseBitmap *NullName = new BaseBitmap(FullF1size);
01022         MFI.reserve(100000);
01023 
01024         int p = 0;
01025         ItemMap = new int[FullF1size];
01026         ItemsetBuffy = new int[FullF1size];
01027 
01028         // Rename items in F1
01029         for (NodeList::iterator nli = F1.begin(); nli != F1.end(); nli++) {
01030             // store old itemid
01031             ItemMap[p] = (*nli)->Prefix;
01032 
01033             // assign new itemid
01034             (*nli)->Prefix = p;
01035 
01036             // assign name bitmaps
01037             (*nli)->Name = new BaseBitmap(FullF1size);
01038             (*nli)->Name->FillEmptyPosition(p);
01039             (*nli)->Name->_count = 1;
01040             p++;
01041         }
01042 
01043         // handle repeated itemsets
01044 
01045         MergeRepeatedItemsets();
01046         F1size = F1.size();
01047 
01048         // Create global tail
01049         maxtail = F1size * (F1size + 1) / 2;
01050         gTail = new TailElement[maxtail];
01051 
01052         // Create buffer for sorting
01053         TailBuffy = new TailElement[F1size];
01054 
01055         // Create buffer for estimating size of each subtree
01056         EstimateSize = (int)ceil(F1size / (double)EstimateDiv);
01057         EstimateBuffy = new SubtreeEstimate[EstimateSize];
01058         for (int estimateIndex = 0; estimateIndex < EstimateSize; estimateIndex++) {
01059             EstimateBuffy[estimateIndex].Count = 1;
01060             EstimateBuffy[estimateIndex].Sum = estimateIndex * EstimateDiv * estimateIndex * EstimateDiv / 2;
01061         }
01062 
01063         // Initialize global tail
01064         int uu;
01065         for (uu = 0; uu < maxtail; uu++) {
01066             gTail[uu].Item = -1;
01067             gTail[uu].Count = 0;
01068         }
01069 
01070         // Fill global tail
01071         for (uu = 0; uu < F1size; uu++) {
01072             gTail[uu].Item = uu;
01073             gTail[uu].Count = F1[uu]->Trans->_count;
01074 
01075             // assign tail index
01076             F1[uu]->tBegin = uu + 1;
01077             if (uu == F1size - 1)
01078                 F1[uu]->tBegin = -1;
01079 
01080             TempName = new BaseBitmap(FullF1size);
01081 
01082             // add a buffer element for each item in F1
01083             BaseBitmap *name = new BaseBitmap(FullF1size);
01084             NameBuffy.push_back(name);
01085             Bitmap *buff = new Bitmap(TransCount);
01086             TransBuffy.push_back(buff);
01087             TreeNode *newNode = new TreeNode();
01088             NodeBuffy.push_back(newNode);
01089         }
01090 
01091         srand(666);
01092         bool FHUT;
01093 
01094         // start algorithm timer
01095         clock_t start, finish;
01096         double duration = -1;
01097         start = clock();
01098 
01099         time(&algorithm_start);
01100 
01101         // create root node and its associated tail
01102         Root = new TreeNode(NullName, NullTrans, 0, 0, -1, 0, F1size);
01103 
01104         //Nothing is in MFI, so nothing is relevant
01105         Root->rBegin = 0;
01106         Root->rEnd = 0;
01107 
01108         // run the appropriate algorithm
01109         if (method.compare("-fci") == 0) {
01110             //cout << "running closure (FCI) algorithm..." << endl;
01111 
01112             GoFHUT = false;      // FHUT flag
01113             HUTMFI = false;      // HUTMFI flag
01114             PEPrune = true;      // PEPrune flag
01115             Reorder = true;      // Reorder flag
01116             MethodIsFCI = true;
01117 
01118             MAFIA(Root, false, FHUT, false);            
01119         } else if (method.compare("-mfi") == 0) {
01120             //cout << "running MFI algorithm..." << endl;
01121             GoFHUT = true;      // FHUT flag
01122             HUTMFI = true;      // HUTMFI flag
01123             PEPrune = false;     // PEPrune flag
01124             Reorder = true;     // Reorder flag
01125             
01126             MAFIA(Root, true, FHUT, false);            
01127         } else if (method.compare("-fi") == 0) {
01128             if (outputMFI) {
01129                 // open output file
01130                 outFile = new ItemsetOutput(outFilename);
01131                 if (!outFile->isOpen()) {
01132                     cerr << "Output file not open!" << endl;
01133                     exit(1);
01134                 }
01135             }
01136 
01137             //cout << "running FI algorithm..." << endl;
01138             GoFHUT = false;      // FHUT flag
01139             HUTMFI = false;      // HUTMFI flag
01140             PEPrune = false;     // PEPrune flag
01141             Reorder = false;     // Reorder flag
01142             MethodIsFI = true;
01143 
01144             MAFIA(Root, false, FHUT, false);
01145 
01146             if (outputMFI) {
01147                 delete outFile;
01148             }            
01149         } else {
01150             cerr << "Invalid algorithm option!" << endl;
01151             exit(0);
01152         }
01153 
01154         finish = clock();
01155         duration = (finish - start) / (double)CLOCKS_PER_SEC;
01156 
01157         printf( "Algorithm CPU time: %.3f seconds.\n", duration );
01158 
01159         time(&algorithm_finish);
01160         algorithm_time = difftime(algorithm_finish, algorithm_start);
01161         printf("Algorithm time: %.2f seconds.\n", algorithm_time);
01162     } else {
01163         MFIBySizes[0]++;
01164     }
01165 
01166     #ifdef DEBUG
01167     // output stat data
01168     cout << "F1size:      " << FullF1size << endl;
01169     cout << "MFISize:     " << MFI.size() << endl;
01170     cout << "MFIAvg: " << MFIDepth / (double) MFISize << endl;
01171     cout << "CountNodes: " << CountNodes << endl;
01172     cout << "CountAnds: " << CountAnds << endl;
01173     cout << "CountSmallAnds: " << CountSmallAnds << endl;
01174     cout << "CountRebuilds: " << CountRebuilds << endl;
01175     cout << "CountCounts: " << CountCounts << endl << endl;
01176     cout << "CountFHUT:   " << CountFHUT << endl;
01177     cout << "CountHUTMFI: " << CountHUTMFI << endl;
01178     cout << "CountHUTMFISuccess: " << CountHUTMFISuccess << endl;
01179     cout << "CountCheckPosition:   " << CountCheckPosition << endl;
01180     cout << "CountPEPrunes:        " << CountPEPrunes << endl;
01181 
01182     /*
01183     cout << "\nBucket count: " << HT.size() << endl;
01184     int minBucket = 10000000;
01185     int maxBucket = -1;
01186     long bucketSum = 0;    
01187     for (HashTable::iterator hash = HT.begin(); hash != HT.end(); hash++) {
01188         cerr << (*hash).first << " ";
01189         ItemSet *itemset = (*hash).second;
01190         int bucketSize = itemset->size();
01191         if (bucketSize < minBucket)
01192             minBucket = bucketSize;
01193         if (bucketSize > maxBucket)
01194             maxBucket = bucketSize;
01195 
01196         bucketSum += bucketSize;
01197         cerr << bucketSize << endl;            
01198     }
01199 
01200     cout << "Bucket avg: " << bucketSum / (double) HT.size() << endl;
01201     cout << "Bucket min: " << minBucket << endl;
01202     cout << "Bucket max: " << maxBucket << endl;
01203     */
01204     
01205     #endif
01206 
01207     /*for (int sizeIndex = 0; sizeIndex <= maxItemsetSize; sizeIndex++) {
01208         cout << MFIBySizes[sizeIndex] << endl;
01209     }*/
01210 
01211     if (outputMFI && !MethodIsFI) {
01212         // PrintMFI to a file
01213         //cout << "Printing out the mfi..." << endl;
01214         time(&print_start);
01215 
01216         PrintMFI();
01217 
01218         time(&print_finish);
01219         print_time = difftime(print_finish, print_start);
01220         printf("Printing output time: %.2f seconds.\n", print_time);
01221     }
01222 
01223     time(&total_finish);
01224     total_time = difftime(total_finish, total_start);
01225     printf("Total time: %.2f seconds.\n\n\n", total_time);
01226 
01227     return 0;
01228 }
01229 /** @} */
01230 
01231 
01232 /////////////////////////////////////////////////////////////////////
01233 /// @mainpage MAFIA Code Documentation
01234 /// \image html MafiaLogo.jpg
01235 ///
01236 /// \section ContactSection Contact
01237 /// - Manuel Calimlim (calimlim@cs.cornell.edu)
01238 /// - Johannes Gehrke (johannes@cs.cornell.edu)
01239 ///
01240 /// \section DownloadSection Download
01241 /// - Main MAFIA webpage
01242 ///   http://himalaya-tools.sourceforge.net/Mafia
01243 ///
01244 /// \section LinuxCompilationSection Linux Compilation
01245 /// -# ./configure
01246 /// -# make
01247 ///     - executable is created as 'src/mafia'
01248 ///
01249 /// \section WindowsCompilationSection Windows Compilation
01250 /// -# Use cygwin and follow the Linux instructions
01251 /// or
01252 /// -# Use a Windows Compiler or IDE (such as Visual Studio)
01253 /// to compile the source code.
01254 ///
01255 /// \section DirectoryStructureSection Directory Structure
01256 /// <table border=1 cellspacing=5 cellpadding=5>
01257 /// <tr>
01258 /// <td>admin/</td>
01259 /// <td>Contains config files for compiling.  Should
01260 ///     not be altered.</td>
01261 /// </tr>
01262 /// <tr>
01263 /// <td>src/</td>
01264 /// <td>Contains all of source code for MAFIA</td>
01265 /// </tr>
01266 /// <tr>
01267 /// <td>src/Transaction.h
01268 /// <br>src/Transaction.cpp</td>
01269 /// <td>Class for reading transactions from ASCII datasets</td>
01270 /// </tr>
01271 /// <tr>
01272 /// <td>src/ItemsetOutput.h
01273 /// <br>src/ItemsetOutput.cpp</td>
01274 /// <td>Class for writing itemsets to file output</td>
01275 /// </tr>
01276 /// <tr>
01277 /// <td>src/BaseBitmap.h
01278 /// <br>src/BaseBitmap.cpp</td>
01279 /// <td>Simple bitmap class for name bitmaps</td>
01280 /// </tr>
01281 /// <tr>
01282 /// <td>src/Bitmap.h
01283 /// <br>src/Bitmap.cpp</td>
01284 /// <td>Main bitmap class for transaction bitmaps</td>
01285 /// </tr>
01286 /// <tr>
01287 /// <td>src/Mafia.cpp</td>
01288 /// <td>Main class file with most of the MAFIA code</td>
01289 /// </tr>
01290 /// <tr>
01291 /// <td>src/Tables.h</td>
01292 /// <td>Stores precomputed lookup tables (not included in
01293 ///    documentation due to very large tables)</td>
01294 /// </tr>
01295 /// <tr>
01296 /// <td>src/TreeNode.h</td>
01297 /// <td>Class for representing nodes in the search tree</td>
01298 /// </tr>
01299 /// <tr>
01300 /// <td>INSTALL</td>
01301 /// <td>Generic installation instructions for Linux/Unix</td>
01302 /// </tr>
01303 /// <tr>
01304 /// <td>mafia.{kdevprj,kdevses}</td>
01305 /// <td>KDevelop project files for Linux</td>
01306 /// </tr>
01307 /// <tr>
01308 /// <td>README</td>
01309 /// <td>Pointer this page</td>
01310 /// </tr>
01311 /// </table>
01312 ///
01313 /// \section UsageSection Program Usage
01314 /// <pre>
01315 /// Usage: mafia [-mfi/-fci/-fi] [min sup (percent)]
01316 ///         [-ascii/-binary] [input filename]
01317 ///         [output filename (optional)]
01318 /// Ex: mafia -mfi .5 -ascii connect4.ascii mfi.txt
01319 /// Ex: mafia -mfi .3 -binary chess.binary
01320 /// </pre>
01321 ///
01322 /// \section InputSection File Input
01323 /// Datasets can be in ASCII or binary format.
01324 /// For ASCII files, the file format must be:
01325 /// <pre>
01326 /// [item_id_1] [item_id_2] ... [item_id_n]
01327 /// </pre>
01328 ///
01329 /// Items do not have to be sorted within each transaction.
01330 /// Items are separated by spaces and each transaction
01331 /// should end with a newline, e.g.
01332 /// <pre>
01333 /// 1 4 2
01334 /// 2 8 9 4
01335 /// 2 5
01336 /// </pre>
01337 ///
01338 /// For binary files, the file format must be:
01339 /// <pre>
01340 /// [custid][transid][number of items][itemid_1][itemid_2] ... [itemid_n]
01341 /// </pre>
01342 ///
01343 /// The custid and transid numbers are ignored at this time.
01344 /// Since the file is in binary format, all numbers are read
01345 /// as integers.
01346 ///
01347 /// \section DatasetsSection Datasets
01348 /// Download <a href="http://sourceforge.net/project/showfiles.php?group_id=72667">Datasets-ascii.tar.gz</a>
01349 /// or <a href="http://sourceforge.net/project/showfiles.php?group_id=72667">Datasets-binary.tar.gz</a>
01350 /// for the full set of datasets used for testing:
01351 /// - chess.{ascii,binary}
01352 /// - connect4.{ascii,binary}
01353 /// - mushroom.{ascii,binary}
01354 /// - pumsb.{ascii,binary}
01355 /// - pumsb_star.{ascii,binary}
01356 ///
01357 /// \section OutputSection Program Output
01358 /// The frequent itemsets outputted by the program are
01359 /// in ASCII form with the following format:
01360 /// <pre>
01361 /// [item_id_1] [item_id_2] ... [item_id_n] [(support)]
01362 /// </pre>
01363 /// Ex:
01364 /// <pre>
01365 /// 28 64 42 60 40 29 52 58 (966)
01366 /// 46 64 42 3 25 9 5 48 66 56 34 62 7 36 60 40 29 52 58 (962)
01367 /// 39 36 40 29 52 58 (960)
01368 /// </pre>
01369 /////////////////////////////////////////////////////////////////////
01370 

Generated on Thu Dec 4 13:38:32 2003 for MAFIA by doxygen1.2.18