mirror of
https://github.com/php/pecl-search_engine-solr.git
synced 2026-03-23 22:52:07 +01:00
36 lines
1.1 KiB
PHP
36 lines
1.1 KiB
PHP
<?php

/*
 * Example: build a SolrExtractRequest from the raw bytes of a PDF file and
 * send it with sendUpdateStream(), using a SolrModifiableParams object to
 * carry the extract parameters.
 *
 * NOTE(review): $client is not defined in this snippet; it is presumably a
 * SolrClient instance created beforehand - confirm against the calling code.
 */

$extractParams = new SolrModifiableParams();

// Only a subset of the extract parameters is shown here; please reference
// docs/documentation.php for the rest of the parameters.
$extractParams
    // index the document, using the unique ID: doc1
    ->set(SolrExtractRequest::LITERALS_PREFIX . 'id', 'doc1')

    // capture what is inside paragraph tags
    ->set(SolrExtractRequest::CAPTURE_ELEMENTS, 'p')

    // index attributes of the Tika XHTML elements into separate fields
    ->set(SolrExtractRequest::CAPTURE_ATTRIBUTES, 'true')

    // map p content to a solr field
    ->set(
        SolrExtractRequest::FIELD_MAPPING_PREFIX . 'p',
        'an_indexed_field_name_that_holds_paragraphs'
    )

    // capture unmapped content here
    ->set(SolrExtractRequest::DEFAULT_FIELD, '__text__')

    // restrict capturing to content matching this xpath expression
    ->set(
        SolrExtractRequest::XPATH_EXPRESSION,
        '/xhtml:html/xhtml:body/xhtml:div//node()'
    )
;

$pdfPath = 'somefile.pdf';

$binContent = file_get_contents($pdfPath);

// file_get_contents() returns false on failure; stop here instead of
// handing a non-string value to createFromStream() as the document body.
if ($binContent === false) {
    throw new RuntimeException("Unable to read file: {$pdfPath}");
}

$extractRequest = SolrExtractRequest::createFromStream($binContent, 'application/pdf', $extractParams);

$response = $client->sendUpdateStream($extractRequest);