## Loading required package: R.utils
## Loading required package: R.oo
## Warning: package 'R.oo' was built under R version 4.3.2
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.2 (2022-06-13 22:00:14 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.26.0 (2024-01-24 05:12:50 UTC) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
## 
##     throw
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## The following objects are masked from 'package:base':
## 
##     attach, detach, load, save
## R.utils v2.12.3 (2023-11-18 01:00:02 UTC) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
## 
##     timestamp
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, isOpen, nullfile, parse, warnings
## Loading required package: xml2
## Loading required package: xslt

Introduction

Apply Transformation

library(xml2)
library(xslt)
# Transform every PubMed XML file found in `xmlfolder` with the XSLT
# stylesheet `xslFn` and write each result into `tfmfolder`, naming it
# "<original base name>-tf.xml".
xslFn <- "xsl/PubMed2PMSubset.xsl"

xmlfolder <- "pubmed-xml"
tfmfolder <- "pubmed-xml-tfm"

# Escape the dot so that only names really ending in ".xml" are picked up
# (an unescaped "." matches any character, e.g. "fooxml").
xmlfiles <- list.files(
  path = xmlfolder,
  pattern = "\\.xml$",
  full.names = TRUE
)
numXMLFiles <- length(xmlfiles)

# Make sure the output folder exists before anything is written into it.
if (!dir.exists(tfmfolder)) {
  dir.create(tfmfolder, recursive = TRUE)
}

# The stylesheet is identical for every input file, so parse it only once.
xslStyle <- read_xml(xslFn)

# seq_len() gives an empty sequence when no files were found; 1:0 would
# instead iterate over c(1, 0) and try to read a non-existent file.
for (i in seq_len(numXMLFiles)) {
  # apply transform to each file
  xmlDoc <- read_xml(xmlfiles[i])

  transformedXML <- xml_xslt(xmlDoc, xslStyle)

  xmlFnBase <- basename(xmlfiles[i])

  # Strip the ".xml" extension by pattern rather than by a fixed
  # character count, then append the "-tf.xml" suffix.
  txnFn <- file.path(
    tfmfolder,
    paste0(sub("\\.xml$", "", xmlFnBase), "-tf.xml")
  )

  status <- write_xml(transformedXML, file = txnFn)
}
# Load one of the transformed files with the XML package to check the result.
xmlDoc <- XML::xmlParse("pubmed-xml-tfm/pubmed22n0007-tf.xml")

# Grab the document's root node ...
root <- XML::xmlRoot(xmlDoc)

# ... and print how many child nodes hang directly below it.
length(XML::xmlChildren(root))
## [1] 30000
# Collect the text of every <Title> element that sits directly under a
# <Journal> element, as a list with one entry per matching node.
journals <- XML::xpathApply(
  doc = xmlDoc,
  path = "//Journal/Title",
  fun = XML::xmlValue
)

Conclusion

This tutorial provided an example of how to extract data from an XML file into a new XML file with a different structure using XSLT.

Tutorial


Files & Resources

All Files for Lesson 6.183

References

No references.

Errata

None collected yet. Let us know.

LS0tCnRpdGxlOiAiRXh0cmFjdCBhbmQgVHJhbnNmb3JtIFB1Yk1lZCBYTUwgRGF0YSB1c2luZyBYU0xUIgpwYXJhbXM6CiAgY2F0ZWdvcnk6IDYKICBudW1iZXI6IDE4MwogIHRpbWU6IDMwCiAgbGV2ZWw6IGJlZ2lubmVyCiAgdGFnczogInIseHBhdGgseG1sIgogIGRlc2NyaXB0aW9uOiAiU2hvd3MgaG93IHRvIGV4dHJhY3QgYSBzdWJzZXQgb2YgdGhlIGRhdGEgaW4gdGhlIFB1Yk1lZAogICAgICAgICAgICAgICAgZGF0YSBzZXQgaW50byBhIG5ldyBYTUwgd2l0aCBhIGRpZmZlcmVudCBzdHJ1Y3R1cmUgdXNpbmcKICAgICAgICAgICAgICAgIFhTTFQuIgpkYXRlOiAiPHNtYWxsPmByIFN5cy5EYXRlKClgPC9zbWFsbD4iCmF1dGhvcjogIjxzbWFsbD5NYXJ0aW4gU2NoZWRsYmF1ZXI8L3NtYWxsPiIKZW1haWw6ICJtLnNjaGVkbGJhdWVyQG5ldS5lZHUiCmFmZmlsaXRhdGlvbjogIk5vcnRoZWFzdGVybiBVbml2ZXJzaXR5IgpvdXRwdXQ6IAogIGJvb2tkb3duOjpodG1sX2RvY3VtZW50MjoKICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OiB0cnVlCiAgICBjb2xsYXBzZWQ6IGZhbHNlCiAgICBudW1iZXJfc2VjdGlvbnM6IGZhbHNlCiAgICBjb2RlX2Rvd25sb2FkOiB0cnVlCiAgICB0aGVtZTogc3BhY2VsYWIKICAgIGhpZ2hsaWdodDogdGFuZ28KLS0tCgotLS0KdGl0bGU6ICI8c21hbGw+YHIgcGFyYW1zJGNhdGVnb3J5YC5gciBwYXJhbXMkbnVtYmVyYDwvc21hbGw+PGJyLz48c3BhbiBzdHlsZT0nY29sb3I6ICMyRTQwNTM7IGZvbnQtc2l6ZTogMC45ZW0nPmByIHJtYXJrZG93bjo6bWV0YWRhdGEkdGl0bGVgPC9zcGFuPiIKLS0tCgpgYGB7ciBjb2RlPXhmdW46OnJlYWRfdXRmOChwYXN0ZTAoaGVyZTo6aGVyZSgpLCcvUi9faW5zZXJ0MkRCLlInKSksIGluY2x1ZGUgPSBGQUxTRX0KYGBgCgpgYGB7ciBpbnN0YWxsQW5kTG9hZFBhY2thZ2VzLCB3YXJuaW5ncz1GLCBlY2hvPUZ9CnBhY2thZ2VzID0gYygiUi51dGlscyIsICJ4bWwyIiwKICAgICAgICAgICAgICJ4c2x0IiwgIlhNTCIpCgojIyBOb3cgbG9hZCBvciBpbnN0YWxsJmxvYWQgYWxsCnBhY2thZ2UuY2hlY2sgPC0gbGFwcGx5KAogIHBhY2thZ2VzLAogIEZVTiA9IGZ1bmN0aW9uKHgpIHsKICAgIGlmICghcmVxdWlyZSh4LCBjaGFyYWN0ZXIub25seSA9IFRSVUUpKSB7CiAgICAgIGluc3RhbGwucGFja2FnZXMoeCwgZGVwZW5kZW5jaWVzID0gVFJVRSwgcmVwb3M9J2h0dHA6Ly9jcmFuLnVzLnItcHJvamVjdC5vcmcnKQogICAgICAjbGlicmFyeSh4LCBjaGFyYWN0ZXIub25seSA9IFRSVUUpCiAgICB9CiAgfQopCmBgYAoKYGBge3IgdW5aaXBQdWJNZWREYXRhRmlsZXMsIGV2YWw9RiwgZWNobz1GfQojIHB1Ym1lZCBkYXRhIGZpbGVzIGFyZSAuZ3ogZmlsZXMgLS0gdW5jb21wcmVzcyBwcmlvciB0byB1c2UKbGlicmFyeShSLnV0aWxzKQoKIyB1bmNvbXByZXNzIGFsbCBmaWxlcyBpbiBmb2xkZXIKZm9sZGVyIDwtICJwdWJtZWQteG1sIiAKCmd6ZmlsZXMgPC0gbGlzdC5maWxl
cyhwYXRoID0gZm9sZGVyLCBwYXR0ZXJuID0gIiouZ3oiLCBmdWxsLm5hbWVzID0gVCkKCmZvciAoaSBpbiAxOmxlbmd0aChnemZpbGVzKSkgewogIGd1bnppcChnemZpbGVzW2ldLCByZW1vdmU9RkFMU0UsIHNraXA9VFJVRSkKfQpgYGAKCiMjIEludHJvZHVjdGlvbgoKIyMgQXBwbHkgVHJhbnNmb3JtYXRpb24KCmBgYHtyfQpsaWJyYXJ5KHhtbDIpCmxpYnJhcnkoeHNsdCkKYGBgCgpgYGB7cn0KeHNsRm4gPC0gInhzbC9QdWJNZWQyUE1TdWJzZXQueHNsIgoKeG1sZm9sZGVyIDwtICJwdWJtZWQteG1sIiAKdGZtZm9sZGVyIDwtICJwdWJtZWQteG1sLXRmbSIKCnhtbGZpbGVzIDwtIGxpc3QuZmlsZXMocGF0aCA9IHhtbGZvbGRlciwgcGF0dGVybiA9ICIueG1sJCIsIGZ1bGwubmFtZXMgPSBUKQpudW1YTUxGaWxlcyA8LSBsZW5ndGgoeG1sZmlsZXMpCgpmb3IgKGkgaW4gMTpudW1YTUxGaWxlcykgewogICMgYXBwbHkgdHJhbnNmb3JtIHRvIGVhY2ggZmlsZQogIHhtbERvYyA8LSByZWFkX3htbCh4bWxmaWxlc1tpXSkKICB4c2xTdHlsZSA8LSByZWFkX3htbCh4c2xGbikKCiAgdHJhbnNmb3JtZWRYTUwgPC0geG1sX3hzbHQoeG1sRG9jLCB4c2xTdHlsZSkKCiAgeG1sRm5CYXNlIDwtIGJhc2VuYW1lKHhtbGZpbGVzW2ldKQogIAogIHR4bkZuIDwtIHBhc3RlMCh0Zm1mb2xkZXIsJy8nLAogICAgICAgICAgICAgICAgICBzdWJzdHJpbmcoeG1sRm5CYXNlLCAxLCBuY2hhcih4bWxGbkJhc2UpLTQpLAogICAgICAgICAgICAgICAgICAnLXRmLnhtbCcpCiAgCiAgc3RhdHVzIDwtIHdyaXRlX3htbCh0cmFuc2Zvcm1lZFhNTCwgZmlsZSA9IHR4bkZuKQp9CmBgYAoKYGBge3IgdGVzdFhNTH0KeG1sRG9jIDwtIHhtbFBhcnNlKCJwdWJtZWQteG1sLXRmbS9wdWJtZWQyMm4wMDA3LXRmLnhtbCIpCmBgYAoKYGBge3J9CnJvb3QgPC0geG1sUm9vdCh4bWxEb2MpCgpsZW5ndGgoeG1sQ2hpbGRyZW4ocm9vdCkpCmBgYAoKYGBge3IgY2hlY2tXaXRoWFBhdGh9CmpvdXJuYWxzID0geHBhdGhBcHBseSh4bWxEb2MsICIvL0pvdXJuYWwvVGl0bGUiLCB4bWxWYWx1ZSkKYGBgCgojIyBDb25jbHVzaW9uCgpUaGlzIHR1dG9yaWFsIHByb3ZpZGVkIGFuIGV4YW1wbGUgb24gaG93IHRvIGV4dHJhY3QgZGF0YSBmcm9tIGFuIFhNTCBmaWxlIGludG8gYSBuZXcgWE1MIGZpbGUgd2l0aCBhIGRpZmZlcmVudCBzdHJ1Y3R1cmUgdXNpbmcgWFNMVC4KCiMjIFR1dG9yaWFsCgpgYGB7PWh0bWx9CjxpZnJhbWUgc3JjPSIiIHdpZHRoPSI0ODAiIGhlaWdodD0iMjcwIiBmcmFtZWJvcmRlcj0iMCIgYWxsb3c9ImF1dG9wbGF5OyBmdWxsc2NyZWVuOyBwaWN0dXJlLWluLXBpY3R1cmUiIGFsbG93ZnVsbHNjcmVlbiBkYXRhLWV4dGVybmFsPSIxIj48L2lmcmFtZT4KYGBgCgotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0KCiMjIEZpbGVzICYgUmVzb3VyY2VzCgpgYGB7ciB6aXBGaWxlcywgZWNobz1G
QUxTRX0KemlwTmFtZSA9IHNwcmludGYoIkxlc3NvbkZpbGVzLSVzLSVzLnppcCIsIAogICAgICAgICAgICAgICAgIHBhcmFtcyRjYXRlZ29yeSwKICAgICAgICAgICAgICAgICBwYXJhbXMkbnVtYmVyKQoKdGV4dEFMaW5rID0gcGFzdGUwKCJBbGwgRmlsZXMgZm9yIExlc3NvbiAiLCAKICAgICAgICAgICAgICAgcGFyYW1zJGNhdGVnb3J5LCIuIixwYXJhbXMkbnVtYmVyKQoKIyBkb3dubG9hZEZpbGVzTGluaygpIGlzIGluY2x1ZGVkIGZyb20gX2luc2VydDJEQi5SCmtuaXRyOjpyYXdfaHRtbChkb3dubG9hZEZpbGVzTGluaygiLiIsIHppcE5hbWUsIHRleHRBTGluaykpCmBgYAoKLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tCgojIyBSZWZlcmVuY2VzCgpObyByZWZlcmVuY2VzLgoKIyMgRXJyYXRhCgpOb25lIGNvbGxlY3RlZCB5ZXQuIExldCB1cyBrbm93LgoKYGBgez1odG1sfQo8c2NyaXB0IHNyYz0iaHR0cHM6Ly9mb3JtLmpvdGZvcm0uY29tL3N0YXRpYy9mZWVkYmFjazIuanMiIHR5cGU9InRleHQvamF2YXNjcmlwdCI+CiAgbmV3IEpvdGZvcm1GZWVkYmFjayh7CiAgICBmb3JtSWQ6ICIyMTIxODcwNzI3ODQxNTciLAogICAgYnV0dG9uVGV4dDogIkZlZWRiYWNrIiwKICAgIGJhc2U6ICJodHRwczovL2Zvcm0uam90Zm9ybS5jb20vIiwKICAgIGJhY2tncm91bmQ6ICIjRjU5MjAyIiwKICAgIGZvbnRDb2xvcjogIiNGRkZGRkYiLAogICAgYnV0dG9uU2lkZTogImxlZnQiLAogICAgYnV0dG9uQWxpZ246ICJjZW50ZXIiLAogICAgdHlwZTogZmFsc2UsCiAgICB3aWR0aDogNzAwLAogICAgaGVpZ2h0OiA1MDAsCiAgICBpc0NhcmRGb3JtOiBmYWxzZQogIH0pOwo8L3NjcmlwdD4KYGBgCmBgYHtyIGNvZGU9eGZ1bjo6cmVhZF91dGY4KHBhc3RlMChoZXJlOjpoZXJlKCksJy9SL19kZXBsb3lLbml0LlInKSksIGluY2x1ZGUgPSBGQUxTRX0KYGBgCg==