Hi Duc,
I did not read your function in detail, but you can also use
BATsample_ (the void-headed version of BATsample).
lefteris
On Wed, Feb 26, 2014 at 2:10 PM, Minh-Duc Pham <commits@monetdb.org> wrote:
Changeset: e7109fc24610 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e7109fc24610
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Generate sample data for all tables
diffs (truncated from 721 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -695,6 +695,11 @@ char isInfrequentSampleProp(CS freqCS, i
if (freqCS.lstPropSupport[propIdx] * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1;
else return 0;
}
+static
+char isInfrequentSampleCol(CS freqCS, PropTypes pt){
+ if (pt.propFreq * 100 < freqCS.support * SAMPLE_FILTER_THRESHOLD) return 1;
+ else return 0;
+}
static
void genCSPropTypesColIdx(CSPropTypes* csPropTypes, int numMergedCS, CSset* freqCSset){
@@ -4830,6 +4835,211 @@ void initSampleData(CSSample *csSample,B
}
}
+
+
+static
+void getSubjIdFromTablePosition(int tblIdx, int pos, oid *sOid){
+ oid id;
+ id = pos;
+ id |= (BUN)(tblIdx + 1) << (sizeof(BUN)*8 - NBITS_FOR_CSID);
+ *sOid = id;
+}
+
+static
+str getOrigSbt(oid *sbt, oid *origSbt, BAT *lmap, BAT *rmap){
+ BUN pos;
+ oid *tmp;
+ pos = BUNfnd(BATmirror(rmap),sbt);
+ if (pos == BUN_NONE){
+ throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded subject must be in rmap");
+ }
+ tmp = (oid *) Tloc(lmap, pos);
+ if (*tmp == BUN_NONE){
+ throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded subject must be in lmap");
+ }
+
+ *origSbt = *tmp;
+
+ return MAL_SUCCEED;
+}
+
+static
+str getOrigObt(oid *obt, oid *origObt, BAT *lmap, BAT *rmap){
+ BUN pos;
+ oid *tmp;
+ oid tmporigOid = BUN_NONE;
+ char objType;
+ BUN maxObjectURIOid = ((oid)1 << (sizeof(BUN)*8 - NBITS_FOR_CSID - 1)) - 1; //Base on getTblIdxFromS
+
+ objType = getObjType(*obt);
+
+ if (objType == URI || objType == BLANKNODE){
+ tmporigOid = (*obt) - ((oid)objType << (sizeof(BUN)*8 - 4));
+ }
+
+ if (tmporigOid > maxObjectURIOid){
+ pos = BUNfnd(BATmirror(rmap),&tmporigOid);
+ if (pos == BUN_NONE){
+ throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded object must be in rmap");
+ }
+ tmp = (oid *) Tloc(lmap, pos);
+ if (*tmp == BUN_NONE){
+ throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded object must be in lmap");
+ }
+
+ *origObt = *tmp;
+ }
+ else{
+ *origObt = tmporigOid;
+ }
+
+ return MAL_SUCCEED;
+}
+
+static
+str initFullSampleData(CSSampleExtend *csSampleEx, int *mTblIdxFreqIdxMapping, CSlabel *label, CStableStat* cstablestat, CSPropTypes *csPropTypes, CSset *freqCSset, int numTables, bat *lmapbatid, bat *rmapbatid){
+ int i, j, k;
+ int freqId;
+ int tmpNumcand;
+ oid tmpCandidate;
+ int randValue = 0;
+ int ranPosition = 0; //random position of the instance in a table
+ int tmpNumCols;
+ int colIdx;
+ BAT *tmpbat = NULL;
+ BATiter tmpi;
+ BAT *cursamplebat = NULL;
+ int tmpNumRows = 0;
+ oid tmpSoid = BUN_NONE, origSoid = BUN_NONE;
+ oid origOid = BUN_NONE;
+ BAT *lmap = NULL, *rmap = NULL;
+
+ if ((lmap = BATdescriptor(*lmapbatid)) == NULL) {
+ throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING);
+ }
+
+ if ((rmap = BATdescriptor(*rmapbatid)) == NULL) {
+ BBPreleaseref(lmap->batCacheid);
+ throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING);
+ }
+ srand(123456);
+ for (i = 0; i < numTables; i++){
+ freqId = mTblIdxFreqIdxMapping[i];
+ csSampleEx[i].freqIdx = freqId;
+ tmpNumcand = (NUM_SAMPLE_CANDIDATE > label[freqId].candidatesCount)?label[freqId].candidatesCount:NUM_SAMPLE_CANDIDATE;
+ csSampleEx[i].name = cstablestat->lstcstable[i].tblname;
+ csSampleEx[i].candidateCount = tmpNumcand;
+ csSampleEx[i].candidates = (oid*)malloc(sizeof(oid) * tmpNumcand);
+ for (k = 0; k < tmpNumcand; k++){
+ csSampleEx[i].candidates[k] = label[freqId].candidates[k];
+ }
+ //Randomly exchange the value, change the position k with a random pos
+ for (k = 0; k < tmpNumcand; k++){
+ randValue = rand() % tmpNumcand;
+ tmpCandidate = csSampleEx[i].candidates[k];
+ csSampleEx[i].candidates[k] = csSampleEx[i].candidates[randValue];
+ csSampleEx[i].candidates[randValue] = tmpCandidate;
+ }
+
+ csSampleEx[i].lstSubjOid = (oid*)malloc(sizeof(oid) * NUM_SAMPLE_INSTANCE);
+ for (k = 0; k < NUM_SAMPLE_INSTANCE; k++)
+ csSampleEx[i].lstSubjOid[k] = BUN_NONE;
+
+ tmpNumCols = csPropTypes[i].numProp - csPropTypes[i].numInfreqProp; //already remove infrequent column;
+ csSampleEx[i].numProp = tmpNumCols;
+
+ assert(tmpNumCols > 0);
+
+ csSampleEx[i].lstProp = (oid*)malloc(sizeof(oid) * tmpNumCols);
+ csSampleEx[i].lstIsInfrequentProp = (char*)malloc(sizeof(char) * tmpNumCols);
+ csSampleEx[i].lstIsMVCol = (char*)malloc(sizeof(char) * tmpNumCols);
+ csSampleEx[i].colBats = (BAT**)malloc(sizeof(BAT*) * tmpNumCols);
+ colIdx = -1;
+ csSampleEx[i].numInstances = 0;
+ for(j = 0; j < csPropTypes[i].numProp; j++){
+ #if REMOVE_INFREQ_PROP
+ if (csPropTypes[i].lstPropTypes[j].defColIdx == -1) continue; //Infrequent prop
+ #endif
+ colIdx++;
+ csSampleEx[i].lstProp[colIdx] = csPropTypes[i].lstPropTypes[j].prop;
+
+ csSampleEx[i].colBats[colIdx] = BATnew(TYPE_void, cstablestat->lstcstable[i].colBats[colIdx]->ttype , NUM_SAMPLE_INSTANCE + 1);
+
+ //Mark whether this col is infrequent sample cols
+ if ( isInfrequentSampleCol(freqCSset->items[freqId], csPropTypes[i].lstPropTypes[j])){
+ csSampleEx[i].lstIsInfrequentProp[colIdx] = 1;
+ }
+ else
+ csSampleEx[i].lstIsInfrequentProp[colIdx] = 0;
+
+ //Mark whther this col is a MV col
+ csSampleEx[i].lstIsMVCol[colIdx] = csPropTypes[i].lstPropTypes[j].isMVProp;
+
+ //if this is a multivalue column, get the data type of the first column
+
+ }
+ assert(colIdx == (tmpNumCols - 1));
+
+
+ // Inserting instances to csSampleEx
+
+ tmpNumRows = BATcount(cstablestat->lstcstable[i].colBats[0]);
+
+ for (k = 0; k < NUM_SAMPLE_INSTANCE; k++){
+ ranPosition = rand() % tmpNumRows;
+
+ getSubjIdFromTablePosition(i, ranPosition, &tmpSoid);
+
+ if (getOrigSbt(&tmpSoid, &origSoid, lmap, rmap) != MAL_SUCCEED){
+ throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal sbt ");
+ }
+
+ csSampleEx[i].lstSubjOid[k] = origSoid;
+
+ for (j = 0; j < tmpNumCols; j++){
+ cursamplebat = csSampleEx[i].colBats[j];
+
+ tmpbat = cstablestat->lstcstable[i].colBats[j];
+ tmpi = bat_iterator(tmpbat);
+
+ if (tmpbat->ttype == TYPE_oid && csSampleEx[i].lstIsMVCol[j] == 0){
+ //Get the original object oid
+ oid *tmpOid = (oid *) BUNtail(tmpi, ranPosition);
+ if(*tmpOid != oid_nil){
+ if (getOrigObt(tmpOid, &origOid, lmap, rmap) != MAL_SUCCEED){
+ throw(RDF, "rdf.RDFdistTriplesToCSs","Problem in getting the orignal obt ");
+ }
+ BUNappend(cursamplebat, &origOid, TRUE);
+ }
+ else{
+ BUNappend(cursamplebat, ATOMnilptr(TYPE_oid), TRUE);
+ }
+
+ }
+ else
+ BUNappend(cursamplebat, BUNtail(tmpi, ranPosition), TRUE);
+
+
+
+ }
+ csSampleEx[i].numInstances++;
+ }
+
+ if (i == 0)
+ for (j = 0; j < tmpNumCols; j++){
+ //BATprint(cstablestat->lstcstable[i].colBats[j]);
+ BATprint(csSampleEx[i].colBats[j]);
+ }
+
+ }
+
+ BBPunfix(lmap->batCacheid);
+ BBPunfix(rmap->batCacheid);
+
+ return MAL_SUCCEED;
+
+}
+
static
void freeSampleData(CSSample *csSample, int numCand){
int i, j;
@@ -4846,6 +5056,25 @@ void freeSampleData(CSSample *csSample,
free(csSample);
}
+
+static
+void freeSampleExData(CSSampleExtend *csSampleEx, int numCand){
+ int i, j;
+ for (i = 0; i < numCand; i++){
+ free(csSampleEx[i].lstProp);
+ free(csSampleEx[i].lstIsInfrequentProp);
+ free(csSampleEx[i].lstIsMVCol);
+ free(csSampleEx[i].candidates);
+ free(csSampleEx[i].lstSubjOid);
+ for (j = 0; j < csSampleEx[i].numProp; j++){
+ BBPunfix(csSampleEx[i].colBats[j]->batCacheid);
+ }
+ free(csSampleEx[i].colBats);
+ }
+
+ free(csSampleEx);
+}
+
static
void addSampleInstance(oid subj, oid *buffO, oid* buffP, int numP, int sampleIdx, CSSample *csSample){
int i,j;
@@ -5217,6 +5446,295 @@ str printSampleData(CSSample *csSample,
return MAL_SUCCEED;
}
+#if 0
+static
+str printFullSampleData(CSSampleExtend *csSampleEx, CSset *freqCSset, BAT *mbat, int num, int sampleVersion){
+
+ int i,j, k;
+ FILE *fout, *fouttb, *foutis;
+ char filename[100], filename2[100], filename3[100];
+ char tmpStr[20], tmpStr2[20], tmpStr3[20];
+ int ret;
+
+ str propStr;
+ str subjStr;
+ char* schema = "rdf";
+ CSSample sample;
+ CS freqCS;
+ char objType = 0;
+ str objStr;
+ oid objOid = BUN_NONE;
+ BATiter mapi;
+ str canStr;
+ char isTitle = 0;
+ char isUrl = 0;
+ char isType = 0;
+ char isDescription = 0;
+ char isImage = 0;
+ char isSite = 0;
+ char isEmail = 0;
+ char isCountry = 0;
+ char isLocality = 0;
+ BAT *lmap = NULL, *rmap = NULL
+#if USE_SHORT_NAMES
+ str propStrShort = NULL;
+ char *pch;
+#endif
+
+
+
+ mapi = bat_iterator(mbat);
+
+ if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+ throw(RDF, "rdf.rdfschema",
+ "could not open the tokenizer\n");
+ }
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list