Synthetic data is valuable for testing databases and programs that import, manipulate, and analyze data. This tutorial shows how to generate synthetic data in R and save it as linked CSV and XML files.
# Product catalogue: one entry per drug, keyed by drug name, giving the
# cost of a single tablet. The table is split into two parallel vectors:
#   drugs           - drug names (character)
#   cost.per.tablet - cost per tablet (numeric, unnamed), same order as drugs
cost.per.tablet <- c(
  Xipramin        = 0.72,
  Colophrazen     = 1.23,
  Diaprogenix     = 0.04,
  Xinoprozen      = 2.82,
  Alaraphosol     = 0.92,
  Gerantrazeophem = 1.87,
  Clobromizen     = 1.44,
  Bhiktarvizem    = 3.87
)
drugs <- names(cost.per.tablet)
# Drop the names so the price vector is a plain numeric vector.
cost.per.tablet <- unname(cost.per.tablet)
# Customer master data: one row per customer with columns
#   custName    - customer name
#   custCountry - country the customer is located in
#   custRep     - numeric ID of the sales rep assigned to the customer
customers <- local({
  # Each line below is one customer record: name, country, rep ID.
  records <- matrix(
    c(
      "Erat Pharma",              "Germany", "100",
      "Eleifend GMBH",            "Germany", "100",
      "Varius Plc",               "Brazil",  "887",
      "Luctus Aliquet Plc",       "Brazil",  "887",
      "Eu Dolor Companie",        "Brazil",  "887",
      "Lorem Luctus",             "USA",     "332",
      "At Pretium LLC",           "USA",     "332",
      "Enim PC",                  "USA",     "203",
      "Adipiscing Mauris Inc.",   "USA",     "203",
      "Proin Dolor Institut",     "Germany", "221",
      "Nisl Quisque",             "Brazil",  "887",
      "Vitae Risus Incorporated", "USA",     "203",
      "Plaxus Medical",           "USA",     "119",
      "Eastern Hospital Group",   "USA",     "119"
    ),
    ncol = 3,
    byrow = TRUE
  )
  data.frame(
    custName = records[, 1],
    custCountry = records[, 2],
    # The matrix is character-only, so convert the rep IDs back to numbers.
    custRep = as.numeric(records[, 3])
  )
})
# Sales team: one row per sales rep with columns
#   repID - numeric rep ID
#   repFN - first name
#   repLN - last name
#   repTR - sales territory
df.Reps <- local({
  # Each entry is one rep: ID, first name, last name, territory.
  roster <- list(
    list(100, "Helmut",   "Schwab",     "EMEA"),
    list(887, "Walison",  "da Silva",   "South America"),
    list(332, "Lynette",  "McRowe",     "East"),
    list(203, "Aneeta",   "Kappoorthy", "West"),
    list(221, "Veronika", "Sixt",       "EMEA"),
    list(655, "Ralph",    "Klinger",    "West"),
    list(119, "Prasad",   "Patel",      "EMEA"),
    list(988, "Xi",       "Zheng",      "EMEA")
  )
  # Pull field number `pos` out of every record as a vector of type `type`.
  field <- function(pos, type) {
    vapply(roster, function(person) person[[pos]], type)
  }
  data.frame(
    repID = field(1, numeric(1)),
    repFN = field(2, character(1)),
    repLN = field(3, character(1)),
    repTR = field(4, character(1))
  )
})
Generate sales transactions.
# ---- Generate synthetic sales transactions ----
# Builds df.Sales with one row per transaction and the columns
#   txnID   - transaction ID, 1001 .. 1000 + numTxns
#   date    - transaction date as "month/day/year" text
#   cust    - customer name, drawn at random from customers$custName
#   prod    - drug name, drawn at random from drugs
#   qty     - number of tablets: a multiple of 100 between 100 and 2000
#   amount  - qty times the cost per tablet of the drawn drug
#   country - country of the drawn customer
#   repID   - sales rep ID of the drawn customer
#
# No random seed is set here, so each run produces different data; call
# set.seed() beforehand if reproducible output is needed.
numTxns <- 100
yearsMin <- 2020
yearsMax <- 2022
stopifnot(numTxns >= 0, yearsMax >= yearsMin)

# All random fields are drawn with sample.int(), which gives every value the
# same probability. (round(runif(1, min, max)) does not: the two end values
# are only half as likely as the values in between.)
prodIndex <- sample.int(length(drugs), numTxns, replace = TRUE)
custIndex <- sample.int(nrow(customers), numTxns, replace = TRUE)
month <- sample.int(12L, numTxns, replace = TRUE)
# Days stop at 28 so that every month/day combination is a valid date.
day <- sample.int(28L, numTxns, replace = TRUE)
# Years span yearsMin..yearsMax inclusive. The offset form is used instead of
# sample(yearsMin:yearsMax, ...) because sample() treats a single number x as
# 1:x, which would give wrong years when yearsMin == yearsMax.
yearSpan <- as.integer(yearsMax) - as.integer(yearsMin) + 1L
year <- as.integer(yearsMin) + sample.int(yearSpan, numTxns, replace = TRUE) - 1L
qty <- sample.int(20L, numTxns, replace = TRUE) * 100

df.Sales <- data.frame(
  txnID = seq_len(numTxns) + 1000,
  # sprintf() returns a zero-length result for zero-length input, so an empty
  # run (numTxns == 0) still yields a valid, empty column.
  date = sprintf("%d/%d/%d", month, day, year),
  # as.character() keeps these columns as plain text even if the lookup
  # tables store their text columns as factors.
  cust = as.character(customers$custName[custIndex]),
  prod = drugs[prodIndex],
  qty = qty,
  # Rounded to cents to remove floating-point noise from the multiplication.
  amount = round(qty * cost.per.tablet[prodIndex], 2),
  country = as.character(customers$custCountry[custIndex]),
  repID = customers$custRep[custIndex],
  row.names = NULL,
  stringsAsFactors = FALSE
)
# ---- Export the data as XML ----
# Writes two files (paths are relative, so they land in the working directory):
#   pharmaSalesTxn.xml - root <txns>; one <txn> element per row of df.Sales
#                        with one child element per column, named after it
#   pharmaReps.xml     - root <salesteam>; one <rep rID="r<repID>"> element
#                        per row of df.Reps with <firstName>, <lastName> and
#                        <territory> children
# Each document is assembled as a vector of lines and joined once, rather
# than growing a single string inside nested loops.

# Convert values to text and replace the characters that are special in XML
# (&, <, >, ") so they cannot break element content or attribute values.
# "&" must be replaced first, otherwise the "&" of the other entities would
# be escaped again.
xml_escape <- function(x) {
  x <- as.character(x)
  x <- gsub("&", "&amp;", x, fixed = TRUE)
  x <- gsub("<", "&lt;", x, fixed = TRUE)
  x <- gsub(">", "&gt;", x, fixed = TRUE)
  gsub("\"", "&quot;", x, fixed = TRUE)
}

# -- Sales transactions --
xml.fn <- "pharmaSalesTxn.xml"
txnCount <- nrow(df.Sales)
txnParts <- c(
  list(rep(" <txn>", txnCount)),
  # One vector of "<col>value</col>" lines per column of df.Sales.
  lapply(names(df.Sales), function(field) {
    sprintf(" <%s>%s</%s>", field, xml_escape(df.Sales[[field]]), field)
  }),
  list(rep(" </txn>", txnCount))
)
# rbind() stacks the parts so that each transaction occupies one matrix
# column; flattening the matrix (column by column) then yields the lines of
# each transaction in document order.
txnLines <- as.vector(do.call(rbind, txnParts))
xml <- paste(
  c('<?xml version="1.0" encoding="UTF-8"?>', "", "<txns>", txnLines, "</txns>"),
  collapse = "\n"
)
# Open the file as UTF-8 to match the encoding named in the XML declaration,
# and always close the connection, even if writing fails.
conn <- file(xml.fn, open = "w", encoding = "UTF-8")
tryCatch(writeLines(xml, conn), finally = close(conn))

# -- Sales reps --
xml.fn <- "pharmaReps.xml"
# Same interleaving technique as above: one matrix column per rep.
repLines <- as.vector(rbind(
  sprintf(' <rep rID="r%s">', xml_escape(df.Reps[["repID"]])),
  sprintf(" <firstName>%s</firstName>", xml_escape(df.Reps[["repFN"]])),
  sprintf(" <lastName>%s</lastName>", xml_escape(df.Reps[["repLN"]])),
  sprintf(" <territory>%s</territory>", xml_escape(df.Reps[["repTR"]])),
  rep(" </rep>", nrow(df.Reps))
))
xml <- paste(
  c('<?xml version="1.0" encoding="UTF-8"?>', "", "<salesteam>", repLines,
    "</salesteam>"),
  collapse = "\n"
)
conn <- file(xml.fn, open = "w", encoding = "UTF-8")
tryCatch(writeLines(xml, conn), finally = close(conn))
This tutorial provided an example of how to generate synthetic data in R and save it as CSV and XML files.
No references.