WeightedListItem
A dictionary with weights. The 'filename' attribute (content type: String; must be a valid filesystem path) is the name of the file containing the weights. Example file:
SIMPLE:
create terminators;
set types = (varchar);
set weights = 1;
add (".": 10);
add ("\;": 2);
add (" --": 1);
add (":": 1);
Complex:
-- fields weights (aliases in parens)
-- =============
-- 1: FIPS code (fips) 1: uniform (uniform)
-- 2: county name (name) 2: population (population)
-- 3: state abbreviation (st) 3: timezone weighting (tz)
-- 4: full state name (state) 4: in zone1 (tz90)
-- 5: ZIP prefix (zone) 5: in zone2 (tz9)
-- 6: gmt offset (gmt) 6: in zone3 (tz1)
--
create fips_county;
set types = (int, varchar, varchar, varchar, varchar, int);
set weights = 6;
set names = (fips, county, st, state, zone, gmt:uniform, population, tz, tz90, tz9, tz1);
add (47187,"Williamson County", "TN", "Tennesee", "3", -5:1, 117569, 1387, 1, 0, 0);
add (46137,"Ziebach County", "SD", "South Dakota", "5", -6:1, 2176, 1148, 1, 0, 0);
add (01127,"Walker County", "AL", "Alabama", "3", -6:1, 71027, 1148, 1, 0, 0);
add (45039,"Fairfield County", "SC", "South Carolina", "2", -5:1, 22394, 1387, 1, 0, 0);
add (39139,"Richland County", "OH", "Ohio", "4", -5:1, 127342, 1387, 1, 0, 0);
add (22041,"Franklin Parish", "LA", "Louisiana", "7", -6:1, 22163, 1148, 1, 0, 0);
add (29061,"Daviess County", "MO", "Mosourri", "6", -6:1, 7842, 1148, 1, 0, 0);
Name | Description | Required | Min | Max | Allowed Values |
---|---|---|---|---|---|
filename | Content type: String (must be a valid filesystem path) Name of the file containing the weights.Example file: SIMPLE: create terminators; set types = (varchar); set weights = 1; add (".": 10); add ("\;": 2); add (" --": 1); add (":": 1); Complex:-- fields weights (aliases in parens) -- ============= -- 1: FIPS code (fips) 1: uniform (uniform) -- 2: county name (name) 2: population (population) -- 3: state abreviation (st) 3: timezone weighting (tz) -- 4: full state name (state) 4: in zone1 (tz90) -- 5: ZIP prefix (zone) 5: in zone2 (tz9) -- 6: gmt offset (gmt) 6 in zone3 (tz1) -- create fips_county; set types = (int, varchar, varchar, varchar, varchar, int); set weights = 6; set names = (fips, county, st, state, zone, gmt:uniform, population, tz, tz90, tz9, tz1); add (47187,"Williamson County", "TN", "Tennesee", "3", -5:1, 117569, 1387, 1, 0, 0); add (46137,"Ziebach County", "SD", "South Dakota", "5", -6:1, 2176, 1148, 1, 0, 0); add (01127,"Walker County", "AL", "Alabama", "3", -6:1, 71027, 1148, 1, 0, 0); add (45039,"Fairfield County", "SC", "South Carolina", "2", -5:1, 22394, 1387, 1, 0, 0); add (39139,"Richland County", "OH", "Ohio", "4", -5:1, 127342, 1387, 1, 0, 0); add (22041,"Franklin Parish", "LA", "Louisiana", "7", -6:1, 22163, 1148, 1, 0, 0); add (29061,"Daviess County", "MO", "Mosourri", "6", -6:1, 7842, 1148, 1, 0, 0); |
no | 0 | 1 | |
seed | Random number generator seed of this Element. Overrides default seeding behavior. | no | 0 | 1 | |
name | (Class)Name of this element. Used to identify plugin Class. Full name is required. Example: com.en.myPluginPackage.myPluginClass | no | 0 | 1 | |
valueColumn | Content type: String The specified list's column which contains the value to generate. Must match a column specified in the respective 'set names' statement. |
yes | 1 | 1 | |
weightColumn | Content type: String The specified list's column which contains the weight to use. Must match a column specified in the respective 'set names' statement. |
yes | 1 | 1 | |
id | Identification String of this element. May be used to uniquely identify a field within the children of an Element. | no | 0 | 1 | |
list | Content type: String The name of the list to use from the specified weight file. A list in the file starts with the keyword 'create'. |
yes | 1 | 1 |
Name | Description | Required | Min | Max | Allowed Values |
---|---|---|---|---|---|
sameChoiceAs | Content type: Empty Requires a <field> and a <generatorByID> attribute (in the same table) to pick the row number from. If specified, this WeightedListItem does not choose a random row; instead, it uses the same row as the referenced generator. |
no | 0 | 1 |
Examples
-
Uniform Weekdays
Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at 'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 1, Tuesday: 1, Wednesday: 1, Thursday: 1, Friday: 1, Saturday: 1, Sunday: 1. All weekdays are equally likely to be generated because each of them has a weight of 1.
Schema config for Uniform Weekdays
<?xml version="1.0" encoding="UTF-8" standalone="no"?> <!-- /******************************************************************************* * Copyright (c) 2013, bankmark and/or its affiliates. All rights reserved. * bankmark UG PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. ******************************************************************************/ --><schema xmlns:doc="http://bankmark.de/pdgf/doc" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="demo" xsi:noNamespaceSchemaLocation="structure/pdgfSchema.xsd"> <!-- All data is derived from this starting seed. If this seed is the same, the generated data will the same on each computer/node/platform. Change this seed to generate a different data set.--> <seed>1234567890L</seed> <rng name="PdgfDefaultRandom"/> <!--Default Scale factor for all tables --> <property name="SF" type="double">1</property> <table name="WEIGHTED_LIST_ITEM"> <!-- if tables should scale with -SF command line argument. Specify your scaling formula here: --> <size>50 * ${SF}</size> <!--Uniform Weekdays--> <!-- Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at 'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 1, Tuesday: 1, Wednesday: 1, Thursday: 1, Friday: 1, Saturday: 1, Sunday: 1. The generated weekdays are all equally likely to be generated given that they are all weighted with 1. --> <field name="weekday_uniform" size="" type="VARCHAR"> <gen_WeightedListItem filename="dicts/bigbench/ds-genProbabilities.txt" list="exampleList" valueColumn="day" weightColumn="uniform"/> </field> </table> </schema>
Output for Uniform Weekdays
Thursday Wednesday Wednesday Sunday Friday Sunday Sunday Wednesday Wednesday Tuesday Monday Monday Thursday Monday Saturday Tuesday Thursday Thursday Monday Thursday Thursday Monday Thursday Wednesday Sunday Tuesday Thursday Saturday Sunday Tuesday Tuesday Friday Thursday Thursday Saturday Saturday Tuesday Saturday Thursday Sunday Thursday Saturday Thursday Wednesday Monday Monday Thursday Saturday Tuesday Monday
-
Weighted Weekdays (Industry)
Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at 'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 4.645, Tuesday: 6.645, Wednesday: 11.23, Thursday: 6.1, Friday: 5.4, Saturday: 2.23, Sunday: 1.0. The days Monday to Friday are more likely to be generated than days on the weekend with Wednesday being the most likely.
Schema config for Weighted Weekdays (Industry)
<?xml version="1.0" encoding="UTF-8" standalone="no"?> <!-- /******************************************************************************* * Copyright (c) 2013, bankmark and/or its affiliates. All rights reserved. * bankmark UG PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. ******************************************************************************/ --><schema xmlns:doc="http://bankmark.de/pdgf/doc" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="demo" xsi:noNamespaceSchemaLocation="structure/pdgfSchema.xsd"> <!-- All data is derived from this starting seed. If this seed is the same, the generated data will the same on each computer/node/platform. Change this seed to generate a different data set.--> <seed>1234567890L</seed> <rng name="PdgfDefaultRandom"/> <!--Default Scale factor for all tables --> <property name="SF" type="double">1</property> <table name="WEIGHTED_LIST_ITEM"> <!-- if tables should scale with -SF command line argument. Specify your scaling formula here: --> <size>50 * ${SF}</size> <!--Weighted Weekdays (Industry)--> <!-- Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at 'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 4.645, Tuesday: 6.645, Wednesday: 11.23, Thursday: 6.1, Friday: 5.4, Saturday: 2.23, Sunday: 1.0. The days Monday to Friday are more likely to be generated than days on the weekend with Wednesday being the most likely. --> <field name="weekday_busy_industry" size="" type="VARCHAR"> <gen_WeightedListItem filename="dicts/bigbench/ds-genProbabilities.txt" list="exampleList" valueColumn="day" weightColumn="busyDaysIndustry"/> </field> </table> </schema>
Output for Weighted Weekdays (Industry)
Tuesday Monday Tuesday Tuesday Friday Thursday Wednesday Friday Thursday Monday Tuesday Tuesday Thursday Monday Wednesday Wednesday Thursday Wednesday Thursday Monday Friday Wednesday Wednesday Wednesday Monday Monday Monday Tuesday Friday Saturday Friday Wednesday Sunday Thursday Thursday Monday Thursday Thursday Monday Thursday Wednesday Friday Monday Thursday Thursday Wednesday Monday Wednesday Thursday Friday
-
Weighted Weekdays (Restaurants)
Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at 'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 1.4, Tuesday: 2.3, Wednesday: 3.0, Thursday: 3.1, Friday: 5.0, Saturday: 9.2, Sunday: 7.2. The weekdays Friday, Saturday, and Sunday are more likely to be generated than the other weekdays.
Schema config for Weighted Weekdays (Restaurants)
<?xml version="1.0" encoding="UTF-8" standalone="no"?> <!-- /******************************************************************************* * Copyright (c) 2013, bankmark and/or its affiliates. All rights reserved. * bankmark UG PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. ******************************************************************************/ --><schema xmlns:doc="http://bankmark.de/pdgf/doc" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="demo" xsi:noNamespaceSchemaLocation="structure/pdgfSchema.xsd"> <!-- All data is derived from this starting seed. If this seed is the same, the generated data will the same on each computer/node/platform. Change this seed to generate a different data set.--> <seed>1234567890L</seed> <rng name="PdgfDefaultRandom"/> <!--Default Scale factor for all tables --> <property name="SF" type="double">1</property> <table name="WEIGHTED_LIST_ITEM"> <!-- if tables should scale with -SF command line argument. Specify your scaling formula here: --> <size>50 * ${SF}</size> <!--Weighted Weekdays (Restaurants)--> <!-- Generates weekdays (Monday, Tuesday, ..., Sunday) using the weighted list at 'dicts/bigbench/ds-genProbabilities.txt' with the following weights: Monday: 1.4, Tuesday: 2.3, Wednesday: 3.0, Thursday: 3.1, Friday: 5.0, Saturday: 9.2, Sunday: 7.2. The weekdays Friday, Saturday, and Sunday are more likely to be generated than the other weekdays. --> <field name="weekday_busy_restaurant" size="" type="VARCHAR"> <gen_WeightedListItem filename="dicts/bigbench/ds-genProbabilities.txt" list="exampleList" valueColumn="day" weightColumn="buysDaysRestaurants"/> </field> </table> </schema>
Output for Weighted Weekdays (Restaurants)
Saturday Wednesday Saturday Sunday Saturday Saturday Saturday Wednesday Friday Monday Sunday Tuesday Saturday Sunday Saturday Saturday Thursday Friday Thursday Sunday Sunday Saturday Sunday Wednesday Monday Saturday Sunday Thursday Tuesday Monday Thursday Friday Sunday Sunday Saturday Saturday Saturday Saturday Friday Thursday Saturday Tuesday Saturday Friday Tuesday Friday Saturday Thursday Wednesday Sunday
-
Weighted Ranges using Histogram
Generates long numbers using a given histogram at 'config/usecases/distinctExample-weightedLists.txt'. For each row the range (min and max) is dynamically chosen from the weighted list. In 60% of the choices the number will be between 1 and 9, in 30% of the choices between 20 and 50, and in 10% of the choices between 100 and 199. The numbers within the dynamically chosen range are generated uniformly. Note that the max value needs to use the 'sameChoiceAs' parameter in order to pick the same range as for the min value.
Schema config for Weighted Ranges using Histogram
<?xml version="1.0" encoding="UTF-8" standalone="no"?> <!-- /******************************************************************************* * Copyright (c) 2013, bankmark and/or its affiliates. All rights reserved. * bankmark UG PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. ******************************************************************************/ --><schema xmlns:doc="http://bankmark.de/pdgf/doc" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" name="demo" xsi:noNamespaceSchemaLocation="structure/pdgfSchema.xsd"> <!-- All data is derived from this starting seed. If this seed is the same, the generated data will the same on each computer/node/platform. Change this seed to generate a different data set.--> <seed>1234567890L</seed> <rng name="PdgfDefaultRandom"/> <!--Default Scale factor for all tables --> <property name="SF" type="double">1</property> <table name="WEIGHTED_LIST_ITEM"> <!-- if tables should scale with -SF command line argument. Specify your scaling formula here: --> <size>50 * ${SF}</size> <!--Weighted Ranges using Histogram--> <!-- Generates long numbers using a given histogram at 'config/usecases/distinctExample-weightedLists.txt'. For each row the range (min and max) is dynamically chosen from the weighted list. In 60% of the choices the number will be between 1 and 9, in 30% of the choices between 20 and 50, and in 10% of the choices will be the range between 100 and 199. The numbers within the dynamically chosen range are generated uniformly. No that max value needs to use the 'sameChoiceAs' parameter in order to pick the same range as for the min value. 
--> <field name="long_number_by_histogram" size="" type="NUMERIC"> <gen_LongNumber> <min> <gen_WeightedListItem filename="config/usecases/distinctExample-weightedLists.txt" id="range" list="weighted_ranges_test2" valueColumn="min" weightColumn="weighted"/> </min> <max> <gen_WeightedListItem filename="config/usecases/distinctExample-weightedLists.txt" list="weighted_ranges_test2" valueColumn="max" weightColumn="weighted"> <sameChoiceAs field="long_number_by_histogram" generatorByID="range"/> </gen_WeightedListItem> </max> </gen_LongNumber> </field> </table> </schema>
Output for Weighted Ranges using Histogram
2 47 5 2 6 9 45 2 8 8 6 5 9 8 40 5 26 154 29 40 40 45 46 153 6 23 2 34 21 1 1 1 40 36 4 26 21 2 42 46 33 151 29 9 21 30 27 6 188 24