// Example program: pipes a DOCX file through a parsing chain into an in-memory
// stream and then checks the produced text with assert.
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
// Receives the exporter output so it can be compared after the chain has run.
std::stringstream out_stream;
try
{
// Chain: binary file stream -> content_type::detector -> office_formats_parser -> HtmlExporter -> out_stream.
std::ifstream("data_processing_definition.docx", std::ios_base::binary) | content_type::detector{} | office_formats_parser{} | HtmlExporter() | out_stream;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
// The whole output must equal the expected text exactly.
// NOTE(review): the literal below spans raw line breaks and looks like it lost its
// markup in this listing -- verify against the upstream example before relying on it.
assert(out_stream.str() ==
"\n"
"\n"
"\n"
"\n"
"DocWire\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"Data processing refers to the activities performed on raw data to convert it into meaningful information. It involves collecting, organizing, analyzing, and interpreting data to extract useful insights and support decision-making. This can include tasks such as sorting, filtering, summarizing, and transforming data through various computational and statistical methods.
Data processing is essential in various fields, including business, science, and technology, as it enables organizations to derive valuable knowledge from large datasets, make informed decisions, and improve overall efficiency.
\n"
"\n");
return 0;
}
```
Parse all files in any format inside archives (ZIP, TAR, RAR, GZ, BZ2, XZ) recursively:
```cpp
#include "docwire.h"
// Example program: sends "test.zip" through a chain that starts with
// DecompressArchives and prints the exported plain text to std::cout.
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
try
{
// Chain: path -> DecompressArchives -> content_type::detector -> office_formats_parser -> OCRParser -> PlainTextExporter -> std::cout.
std::filesystem::path("test.zip") | DecompressArchives() | content_type::detector{} | office_formats_parser{} | OCRParser{} | PlainTextExporter() | std::cout;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
return 0;
}
```
Classify a file in any format (Office, PDF, mail, etc.) into any categories using the built-in local AI model:
```cpp
#include "docwire.h"
#include
#include
// Example program: exports an ODT document as plain text, passes it to
// local_ai::model_chain_element with a classification prompt and checks that
// the collected answer is "report".
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
// Receives the model answer so it can be compared after the chain has run.
std::stringstream out_stream;
try
{
// Chain: path -> content_type::detector -> office_formats_parser -> PlainTextExporter -> local_ai::model_chain_element(prompt, ...) -> out_stream.
// NOTE(review): the template argument of std::make_shared is not visible in this listing -- confirm against the upstream example.
std::filesystem::path("document_processing_market_trends.odt") | content_type::detector{} | office_formats_parser{} | PlainTextExporter() | local_ai::model_chain_element("Classify to one of the following categories and answer with exact category name: agreement, invoice, report, legal, user manual, other:\n\n", std::make_shared()) | out_stream;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
// Exact comparison: the output must be the category name with no trailing newline.
assert(out_stream.str() == "report");
return 0;
}
```
Classify a file in any format (Office, PDF, mail, etc.) into any categories using the OpenAI service:
```cpp
#include "docwire.h"
#include
#include
// Example program: exports an ODT document as plain text, passes it to
// openai::Classify with a list of category names and checks that the collected
// answer is "report\n".
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
// Receives the classification result so it can be compared after the chain has run.
std::stringstream out_stream;
try
{
// The API key is read from the OPENAI_API_KEY environment variable.
// NOTE: std::getenv returns a null pointer when the variable is not set, so define it before running.
std::filesystem::path("document_processing_market_trends.odt") | content_type::detector{} | office_formats_parser{} | PlainTextExporter() | openai::Classify({ "agreement", "invoice", "report", "legal", "other"}, std::getenv("OPENAI_API_KEY")) | out_stream;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
// Exact comparison, including the trailing newline.
assert(out_stream.str() == "report\n");
return 0;
}
```
Translate a document in any format (Office, PDF, mail, etc.) to another language using the built-in local AI model:
```cpp
#include "docwire.h"
#include
#include
// Example program: exports a DOC document as plain text, passes it to
// local_ai::model_chain_element with a "Translate to spanish" prompt and
// compares the collected output with a reference translation approximately.
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
// Receives the model output so it can be compared after the chain has run.
std::stringstream out_stream;
try
{
// Chain: path -> content_type::detector -> office_formats_parser -> PlainTextExporter -> local_ai::model_chain_element(prompt, ...) -> out_stream.
// NOTE(review): the template argument of std::make_shared is not visible in this listing -- confirm against the upstream example.
std::filesystem::path("data_processing_definition.doc") | content_type::detector{} | office_formats_parser{} | PlainTextExporter() | local_ai::model_chain_element("Translate to spanish:\n\n", std::make_shared()) | out_stream;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
// Approximate check: the output does not have to equal the reference text,
// fuzzy_match::ratio against it only has to exceed 80.
assert(fuzzy_match::ratio(out_stream.str(), "La procesación de datos se refiere a las actividades realizadas en el ámbito de los datos en materia de información. Se trata de recoger, organizar, analizar y interpretar los datos para extraer inteligencias y apoyar el procesamiento de decisión. Esto puede incluir tareas como la etiqueta, la filtración, la summarización y la transformación de los datos a través de diversos métodos compuestos y estadounidenses. El procesamiento de datos es esencial en diversos ámbitos, incluyendo el negocio, la ciencia y la tecnologàa, pues permite a las empresas a extraer conocimientos valiosos de grans de datos, hacer decisiones indicadas y mejorar la eficiencia global.") > 80);
return 0;
}
```
Translate a document in any format (Office, PDF, mail, etc.) to another language using the OpenAI service:
```cpp
#include "docwire.h"
#include
#include
// Example program: exports a DOC document as plain text, passes it to
// openai::TranslateTo("spanish", ...) and compares the collected output with
// an expected Spanish text.
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
// Receives the translation so it can be compared after the chain has run.
std::stringstream out_stream;
try
{
// The API key is read from the OPENAI_API_KEY environment variable.
// NOTE: std::getenv returns a null pointer when the variable is not set, so define it before running.
std::filesystem::path("data_processing_definition.doc") | content_type::detector{} | office_formats_parser{} | PlainTextExporter() | openai::TranslateTo("spanish", std::getenv("OPENAI_API_KEY")) | out_stream;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
// Exact comparison, including the trailing newline.
assert(out_stream.str() == "El procesamiento de datos se refiere a las actividades realizadas en datos crudos para convertirlos en información significativa. Implica recolectar, organizar, analizar e interpretar datos para extraer ideas útiles y apoyar la toma de decisiones. Esto puede incluir tareas como ordenar, filtrar, resumir y transformar datos a través de varios métodos computacionales y estadísticos. El procesamiento de datos es esencial en varios campos, incluyendo negocios, ciencia y tecnología, ya que permite a las organizaciones obtener conocimientos valiosos de grandes conjuntos de datos, tomar decisiones informadas y mejorar la eficiencia general.\n");
return 0;
}
```
Detect the sentiment of a document in any format (Office, PDF, mail, etc.) using the built-in local AI model:
```cpp
#include "docwire.h"
#include
#include
// Example program: exports a DOC document as plain text, passes it to
// local_ai::model_chain_element with a "Detect sentiment" prompt and checks
// that the collected answer is "positive".
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
// Receives the model answer so it can be compared after the chain has run.
std::stringstream out_stream;
try
{
// Chain: path -> content_type::detector -> office_formats_parser -> PlainTextExporter -> local_ai::model_chain_element(prompt, ...) -> out_stream.
// NOTE(review): the template argument of std::make_shared is not visible in this listing -- confirm against the upstream example.
std::filesystem::path("data_processing_definition.doc") | content_type::detector{} | office_formats_parser{} | PlainTextExporter() | local_ai::model_chain_element("Detect sentiment:\n\n", std::make_shared()) | out_stream;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
// Exact comparison: no trailing newline is expected.
assert(out_stream.str() == "positive");
return 0;
}
```
Detect the sentiment of a document in any format (Office, PDF, mail, etc.) using the newest GPT-4 Turbo model with 128K context:
```cpp
#include "docwire.h"
#include
// Example program: exports "1.doc" as plain text, passes it to
// openai::DetectSentiment with openai::Model::gpt4_turbo_preview selected and
// prints the result to std::cout.
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
try
{
// The API key is read from the OPENAI_API_KEY environment variable.
// NOTE: std::getenv returns a null pointer when the variable is not set, so define it before running.
std::filesystem::path("1.doc") | content_type::detector{} | office_formats_parser{} | PlainTextExporter() | openai::DetectSentiment(std::getenv("OPENAI_API_KEY"), openai::Model::gpt4_turbo_preview) | std::cout;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
return 0;
}
```
Make a summary of a document in any format (Office, PDF, mail, etc.) using the built-in local AI model:
```cpp
#include "docwire.h"
#include
#include
// Example program: exports a DOC document as plain text, passes it to
// local_ai::model_chain_element with a summarization prompt and compares the
// collected answer with an expected one-sentence summary.
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
// Receives the model answer so it can be compared after the chain has run.
std::stringstream out_stream;
try
{
// Chain: path -> content_type::detector -> office_formats_parser -> PlainTextExporter -> local_ai::model_chain_element(prompt, ...) -> out_stream.
// NOTE(review): the template argument of std::make_shared is not visible in this listing -- confirm against the upstream example.
std::filesystem::path("data_processing_definition.doc") | content_type::detector{} | office_formats_parser{} | PlainTextExporter() | local_ai::model_chain_element("Write a short summary for this text:\n\n", std::make_shared()) | out_stream;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
// Exact comparison: no trailing newline is expected.
assert(out_stream.str() == "Data processing is the process of transforming raw data into meaningful information.");
return 0;
}
```
Make a voice summary of a document in any format (Office, PDF, mail, etc.) in two steps: summarize using a GPT model and convert the summary to speech using a text-to-speech model. The result is saved to an mp3 file:
```cpp
#include "docwire.h"
#include
// Example program: exports "1.doc" as plain text, passes it through
// openai::Summarize and then openai::TextToSpeech, and writes the result to
// the file "summary.mp3".
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
try
{
// Both OpenAI steps read the API key from the OPENAI_API_KEY environment variable.
// NOTE: std::getenv returns a null pointer when the variable is not set, so define it before running.
std::filesystem::path("1.doc") | content_type::detector{} | office_formats_parser{} | PlainTextExporter() | openai::Summarize(std::getenv("OPENAI_API_KEY")) | openai::TextToSpeech(std::getenv("OPENAI_API_KEY")) | std::ofstream("summary.mp3");
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
return 0;
}
```
Make a text summary of a voice recording (e.g. an mp3 file with a meeting recording) in two steps: convert voice to text using the Whisper-1 model and summarize the text using a GPT model:
```cpp
#include "docwire.h"
#include
// Example program: passes an mp3 file to openai::Transcribe, exports the
// result as plain text, passes it to openai::Summarize and prints the result
// to std::cout.
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
try
{
// Both OpenAI steps read the API key from the OPENAI_API_KEY environment variable.
// NOTE: std::getenv returns a null pointer when the variable is not set, so define it before running.
std::filesystem::path("data_processing_definition.mp3") | openai::Transcribe(std::getenv("OPENAI_API_KEY")) | PlainTextExporter() | openai::Summarize(std::getenv("OPENAI_API_KEY")) | std::cout;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
return 0;
}
```
Find phrases, objects and events with smart matching in documents in any format (Office, PDF, mail, etc.) using the built-in local AI model:
```cpp
#include "docwire.h"
#include
#include
// Example program: exports a DOC document as plain text, passes it to
// local_ai::model_chain_element with a "Find sentence about ..." prompt and
// compares the collected answer with the expected sentence.
// Returns 0 on success and 1 when the chain throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
// Receives the model answer so it can be compared after the chain has run.
std::stringstream out_stream;
try
{
// Chain: path -> content_type::detector -> office_formats_parser -> PlainTextExporter -> local_ai::model_chain_element(prompt, ...) -> out_stream.
// NOTE(review): the template argument of std::make_shared is not visible in this listing -- confirm against the upstream example.
std::filesystem::path("data_processing_definition.doc") | content_type::detector{} | office_formats_parser{} | PlainTextExporter() | local_ai::model_chain_element("Find sentence about \"data convertion\" in the following text:\n\n", std::make_shared()) | out_stream;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
// Exact comparison: no trailing newline is expected.
assert(out_stream.str() == "Data processing refers to the activities performed on raw data to convert it into meaningful information.");
return 0;
}
```
Find phrases, objects and events in a text or an image using a GPT model:
```cpp
#include "docwire.h"
#include
// Example program: runs openai::Find three times on the same image
// ("scene_1.png"), once for each of "car", "person" and "running".
// Returns 0 on success and 1 when any of the chains throws a std::exception.
int main(int argc, char* argv[])
{
using namespace docwire;
// All three chains write into this stream; it is not inspected afterwards.
std::stringstream out_stream;
try
{
// Each call passes the API key from OPENAI_API_KEY, openai::Model::gpt4_vision_preview,
// the value 0 and openai::ImageDetail::low; only the searched phrase differs.
// NOTE: std::getenv returns a null pointer when the variable is not set, so define it before running.
std::filesystem::path("scene_1.png") | openai::Find("car", std::getenv("OPENAI_API_KEY"), openai::Model::gpt4_vision_preview, 0, openai::ImageDetail::low) | out_stream;
std::filesystem::path("scene_1.png") | openai::Find("person", std::getenv("OPENAI_API_KEY"), openai::Model::gpt4_vision_preview, 0, openai::ImageDetail::low) | out_stream;
std::filesystem::path("scene_1.png") | openai::Find("running", std::getenv("OPENAI_API_KEY"), openai::Model::gpt4_vision_preview, 0, openai::ImageDetail::low) | out_stream;
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
return 0;
}
Reusing a single parsing chain to parse multiple input files:
```cpp
#include "docwire.h"
#include
// Example program: builds one chain without an input and then feeds two
// files into it in a loop.
// Returns 0 on success and 1 when a std::exception is thrown.
int main(int argc, char* argv[])
{
using namespace docwire;
try
{
auto chain = content_type::detector{} | office_formats_parser{} | PlainTextExporter() | std::cout; // create a chain of steps to parse a file
// i takes the values 1 and 2, so the inputs are "1.docx" and "2.docx".
for (int i = 1; i < 3; ++i)
std::ifstream(std::to_string(i) + ".docx", std::ios_base::binary) | chain; // set the input file as an input stream
}
catch (const std::exception& e)
{
// Print the diagnostic message built from the caught exception and signal failure.
std::cerr << errors::diagnostic_message(e) << std::endl;
return 1;
}
return 0;
}
```
Handling errors and warnings:
[You can find an example of handling errors and warnings here](https://docwire.readthedocs.io/en/latest/handling_errors_and_warnings_8cpp-example.html)
Using a transformer to filter out emails (e.g. from an Outlook PST mailbox) with a subject containing "Hello":
```cpp
#include "docwire.h"
// Example program: parses "1.pst" and places a lambda transformer in the chain
// that marks mail nodes whose subject contains "Hello" to be skipped, then
// prints the exported plain text to std::cout.
// Unlike the other examples there is no try/catch here, so an exception thrown
// by the chain is not handled inside main.
int main(int argc, char* argv[])
{
using namespace docwire;
// Chain: path -> content_type::detector -> mail_parser -> office_formats_parser -> transformer lambda -> PlainTextExporter -> std::cout.
// NOTE(review): the template arguments of std::holds_alternative / std::get are not visible in this listing -- confirm against the upstream example.
std::filesystem::path("1.pst") | content_type::detector{} |
mail_parser{} | office_formats_parser{}
| [](Info &info) // Create an input from file path, parser and connect them to transformer
{
if (std::holds_alternative(info.tag)) // if current node is mail
{
auto subject = std::get(info.tag).subject; // get the subject attribute
if (subject) // if subject attribute exists
{
if (subject->find("Hello") != std::string::npos) // if subject contains "Hello"
{
info.skip = true; // skip the current node
}
}
}
}
| PlainTextExporter() // sets exporter to plain text
| std::cout;
return 0;
}
```

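Joining transformers to filter out emails (e.g. from an Outlook PST mailbox) with a subject containing "Hello" and to limit the number of processed mails (the example below uses `max_mails = 1`; set it to 10 to process up to ten mails):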
```cpp
#include "docwire.h"
// Example program: parses "1.pst" with two lambda transformers joined in one
// chain. The first marks mail nodes whose subject contains "Hello" to be
// skipped; the second counts mail nodes and requests cancellation once the
// count exceeds max_mails. The exported plain text goes to std::cout.
// There is no try/catch here, so an exception thrown by the chain is not
// handled inside main.
int main(int argc, char* argv[])
{
using namespace docwire;
// NOTE(review): the template arguments of std::holds_alternative / std::get are not visible in this listing -- confirm against the upstream example.
std::filesystem::path("1.pst") | content_type::detector{} |
mail_parser{} | office_formats_parser{} |
[](Info &info) // Create an input from file path, parser and connect them to transformer
{
if (std::holds_alternative(info.tag)) // if current node is mail
{
auto subject = std::get(info.tag).subject; // get the subject attribute
if (subject) // if subject attribute exists
{
if (subject->find("Hello") != std::string::npos) // if subject contains "Hello"
{
info.skip = true; // skip the current node
}
}
}
} |
// counter and max_mails live inside the closure; "mutable" allows ++counter to modify the captured copy.
// With max_mails = 1 the cancel flag is set when the second mail node is seen.
[counter = 0, max_mails = 1](Info &info) mutable // Create a transformer and connect it to previous transformer
{
if (std::holds_alternative(info.tag)) // if current node is mail
{
if (++counter > max_mails) // if counter is greater than max_mails
{
info.cancel = true; // cancel the parsing process
}
}
} |
PlainTextExporter() | // sets exporter to plain text
std::cout;
return 0;
}
```
## Awards
- SourceForge Community Choice (2023) - project has qualified for this award out of over 500,000 open source projects on SourceForge
- Microsoft for Startups grant (2022) - project was selected by Microsoft to accelerate its growth by providing Expert Guidance, development tools, Azure and GitHub cloud infrastructure and OpenAI machine learning utilities
## Installation
### Why Choose vcpkg for DocWire SDK Installation?
DocWire has embraced vcpkg as the preferred installation method for several compelling reasons:
- **Microsoft's Trusted Solution:** vcpkg is a package manager developed and backed by Microsoft, ensuring reliability, ongoing support, and compatibility.
- **Cross-Platform Simplicity:** With vcpkg, DocWire installation becomes a breeze on Windows, Linux, and macOS, providing a unified and hassle-free experience.
- **Effortless Dependency Management:** vcpkg takes care of resolving and installing dependencies automatically, reducing manual configuration efforts for developers.
- **Swift Binary Package Deployment:** The availability of pre-built binary packages in vcpkg accelerates installation, minimizing the need for time-consuming manual compilation.
- **Seamless CMake Integration:** vcpkg seamlessly integrates with the CMake build system, simplifying the incorporation of DocWire into CMake-based projects.
By selecting vcpkg, DocWire ensures that programmers benefit from a trusted, user-friendly, and well-supported solution that guarantees a smooth installation experience.
### Supported Platforms
DocWire SDK is compatible with a variety of operating systems. Windows, Linux, and macOS are officially supported (supported triplets are: x64-linux-dynamic, x64-windows, x64-osx-dynamic and arm64-osx-dynamic), but in theory it can be run on other operating systems as well. To ensure compatibility, our continuous integration tests run on the following GitHub runners:
- [ubuntu-24.04](https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2404-Readme.md)
- [ubuntu-22.04](https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2204-Readme.md)
- [ubuntu-20.04](https://github.com/actions/runner-images/blob/main/images/ubuntu/Ubuntu2004-Readme.md) with gcc upgraded to version 11
- [windows-2022](https://github.com/actions/runner-images/blob/main/images/windows/Windows2022-Readme.md)
- [windows-2019](https://github.com/actions/runner-images/blob/main/images/windows/Windows2019-Readme.md)
- [macos-15](https://github.com/actions/runner-images/blob/main/images/macos/macos-15-Readme.md)
- [macos-14](https://github.com/actions/runner-images/blob/main/images/macos/macos-14-Readme.md)
- [macos-13](https://github.com/actions/runner-images/blob/main/images/macos/macos-13-Readme.md)
As the project evolves, we will continue to expand the list of officially supported platforms to ensure broad compatibility and meet the needs of our users.
### Required Tools
The DocWire SDK installation process is based on the vcpkg package manager, so all requirements of vcpkg apply:
- ["Prerequisites" on Microsoft Learn](https://learn.microsoft.com/en-us/vcpkg/get_started/get-started#prerequisites)
In addition to vcpkg requirements, DocWire SDK requires the following tools:
- GCC 11 or higher on Linux because of C++20 support
- MSVC 2019 or higher on Windows because of C++20 support
- Doxygen with Graphviz is required for documentation generation: ["Installation" on Doxygen website](https://www.doxygen.nl/manual/install.html)
- Autoconf, Autogen, Automake, Autoconf-archive and Autopoint are required for building some of the third-party dependencies on Linux and macOS (we are working to eliminate this requirement)
### Installation via build.sh or build.ps1 scripts
The current preferred installation method is via the build.sh or build.ps1 scripts. The script will automatically install vcpkg, select the correct triplet, add the required overlays and install DocWire SDK with all dependencies from sources.
1. **Clone the DocWire Repository:**
Clone the DocWire repository from GitHub if you haven't already:
```
git clone https://github.com/docwire/docwire.git
```
2. **Run build.sh (Linux, macOS) or build.ps1 (Windows):**
- Linux, macOS:
```
cd docwire
./build.sh
```
- Windows:
```
cd docwire
./build.ps1
```
**Important note**: Sometimes installation of dependencies may fail due to various reasons. Some of those reasons are not related to the DocWire SDK itself, but rather to the vcpkg package manager or the specific dependencies being installed. For example, errors may occur while downloading sources or if there are unexpected changes in vcpkg ports or upstream websites. In such cases, it is recommended to [report the issue to the vcpkg project](https://github.com/microsoft/vcpkg/issues) or just try running the installation script again. In rare cases, some dependencies may require manual intervention or additional configuration before they can be successfully installed. If you encounter any issues during the installation process, please don't hesitate to reach out to the [DocWire SDK community for support](https://github.com/docwire/docwire/issues).
3. **Integrate with your project or development environment**
You can use vcpkg toolchain file to integrate DocWire SDK with your CMake project:
```
cmake -DCMAKE_TOOLCHAIN_FILE=docwire/vcpkg/scripts/buildsystems/vcpkg.cmake ..
```
Please refer to ["vcpkg in CMake projects" on Microsoft Learn](https://learn.microsoft.com/en-us/vcpkg/users/buildsystems/cmake-integration) for more information on how to use vcpkg with your CMake project.
To use DocWire SDK in your MSBuild projects (Visual Studio) run the following command:
```
cd docwire/vcpkg
vcpkg integrate install
```
This automatically adds installed packages to the following project properties: Include Directories, Link Directories, and Link Libraries. Additionally, it creates a post-build action that ensures that any required DLLs are copied into the build output folder.
Please refer to ["vcpkg in MSBuild projects" on Microsoft Learn](https://learn.microsoft.com/en-us/vcpkg/users/buildsystems/msbuild-integration) for more information on how to use vcpkg with your MSBuild project.
For other build systems, check your build system's documentation for how to use prebuilt binaries.
Please refer to ["Manual Integration" on Microsoft Learn](https://learn.microsoft.com/en-us/vcpkg/users/buildsystems/manual-integration) for more information.
### Installation in preexisting vcpkg instance
You need to do the configuration, installation and integration manually. Please follow recommendations in vcpkg documentation and check content of build.sh or build.ps1 script for details.
Required overlays are located in "ports" subdirectory.
### Pre-built binaries (binary cache)
Vcpkg builds libraries from source but offers an option to store the results of the build process in a binary cache. This allows other developer machines or continuous integration runs to reference these prebuilt packages without running a new build every time. By using a binary cache, vcpkg can detect if a rebuild is necessary by checking if the cache already contains a valid existing package with appropriate binaries.
Please refer to ["What is binary caching?" on Microsoft Learn](https://learn.microsoft.com/en-us/vcpkg/consume/binary-caching-overview) for more information.
We used to provide pre-built binaries but we realized that they are not reliable as they are tightly coupled with specific versions of compiler, operating system and other dependencies. To solve this issue, our continuous integration scripts use vcpkg binary caching with GitHub packages and we are working to offer this to other users. This will allow everyone to use the latest version of DocWire SDK without having to worry about compatibility.
### Conclusion
You're all set! You've successfully installed the DocWire library using vcpkg. You can now use the DocWire library in your code to perform text extraction and other data processing tasks.
## Versioning
DocWire SDK introduces a distinctive versioning methodology, deviating from conventional semantic versioning (SemVer) in favor of a dynamic, date-centric system. This chapter elucidates the versioning strategy of DocWire SDK, shedding light on its unique attributes and the benefits derived from this innovative approach.
### "Release Early, Release Often" Strategy with Date-Based Versioning
DocWire SDK's versioning system adopts the "release early, release often" concept, ensuring that users benefit from the latest features and improvements as soon as they are available. This approach minimizes the usage of outdated code, ensuring that resources are directed towards delivering the best possible user experience.
DocWire SDK's versioning system integrates release dates as a pivotal element. Each release is identified by a specific date, providing users with a transparent timeline of updates and enhancements. In most cases, a new release is created immediately after a new feature or an important bug fix is merged into the main branch.
The "release early, release often" strategy is based on the idea that releasing code frequently allows for faster resolution of issues, faster feedback from users, and enables developers to incorporate community feedback more effectively. By releasing code more frequently, users benefit from the latest changes and features, enabling them to stay up-to-date with the SDK's evolution.
This approach is in line with industry best practices and aligns with the "live at the head" concept, providing users with a transparent timeline of updates and enhancements. This approach ensures clarity and enables users to comprehend the evolution of the SDK over time.
### Advantages of "Release Early, Release Often"
#### Continuous Evolution
The "release early, release often" strategy fosters a model of continuous evolution, allowing for swift development and deployment of new features. This ensures that users have timely access to the latest advancements and can choose to stay current or opt for specific versions tailored to their requirements.
#### Improved Collaboration
Transparent and chronological versioning facilitates collaboration among developers, contributors, and users. This inclusive model enables everyone to track the SDK's progress, understand the sequence of changes, and contribute to discussions around specific releases, fostering a sense of community and shared ownership.
#### Rapid Issue Resolution
The "release early, release often" concept accelerates issue resolution by providing prompt access to bug fixes and improvements. This agile approach ensures that users encounter fewer obstacles, leading to a more responsive and satisfying experience.
#### User-Centric Updates
DocWire SDK's versioning approach prioritizes user-centric updates, enabling developers to make informed decisions about when to integrate the latest changes into their projects. This flexibility empowers users to tailor their SDK experience based on specific features or fixes introduced in each release.
### Codebase Strategy
DocWire SDK maintains a single code branch, focusing on continuous improvement of the API and staying up-to-date with integrated external services. This strategy prioritizes innovation over stability, ensuring that resources are directed towards enhancing the SDK rather than maintaining outdated code branches. This approach aligns with industry best practices and optimizes the SDK for evolving technology landscapes.
### Long-Term Support (LTS) Agreements
Recognizing the diverse needs of our users, DocWire SDK offers Long-Term Support (LTS) agreements, providing a tailored solution for those seeking sustained stability and reliability. The LTS agreement is designed to address specific concerns related to feature stability, API consistency, and platform support, offering peace of mind as projects progress and evolve.
#### Customized Stability
With an LTS agreement, customers can select specific features or aspects of the API that are crucial to their projects. This ensures that the chosen features remain stable and will not be subject to unexpected changes or removals as the SDK continues to evolve. Customized stability empowers users to build and maintain applications with confidence, knowing that the core functionalities they rely on will remain consistent over time.
#### API Consistency
For projects that demand a consistent API, the LTS agreement provides assurance that the API's core elements will remain unchanged throughout the agreed-upon support period. This commitment to API consistency enables developers to build and maintain applications with minimal disruptions, fostering a stable and reliable development environment.
#### Platform Support Assurance
The LTS agreement extends to platform support, offering a guarantee that the SDK will continue to support selected platforms. This is particularly beneficial for projects with specific platform dependencies, ensuring that compatibility is maintained even as new SDK updates are introduced. Customers can negotiate the details of platform support to align with their project requirements.
#### Tailored Support Duration
DocWire SDK understands that the definition of "long-term" can vary based on individual project timelines and needs. Therefore, LTS agreements come with the flexibility to negotiate the support duration, allowing customers to align the agreement with their project's lifecycle and development roadmap.
#### Ongoing Collaboration
An LTS agreement signifies an ongoing collaboration between DocWire SDK and the customer. It establishes a dedicated channel for communication, ensuring that any concerns or specific requirements related to stability, API, or platform support are addressed promptly. This collaborative approach reflects our commitment to supporting our users throughout their development journey.
#### How to Enquire about LTS
To explore the possibilities of an LTS agreement or to discuss specific requirements, please reach out to our dedicated support team. We are committed to working closely with our users to create customized LTS agreements that cater to the unique demands of their projects, providing a solid foundation for long-term success.
## Logging
DocWire SDK generates extensive logs that provide insights into the current processing status, warnings, and errors. In the latest version of the SDK, the logging mechanism has been enhanced to output logs in JSON format, offering several advantages.
The enhanced logging mechanism in the DocWire SDK provides developers with powerful tools for monitoring and debugging data processing. Whether redirecting logs to a custom stream or leveraging the flexibility of JSON formatting, the logging system is designed to meet the diverse needs of users.
### JSON Format for Logging
The logs are now formatted in JSON, providing a structured and machine-readable representation of the information. This format is advantageous for various reasons:
- **Structured Data**: JSON allows for a clear and organized representation of log data, making it easy to extract specific information.
- **Compatibility**: JSON is widely supported by various tools and platforms, ensuring compatibility and ease of integration into existing workflows.
- **Readability**: The human-readable nature of JSON logs facilitates manual inspection and troubleshooting when needed.
- **Flexibility**: JSON's key-value pair structure accommodates a wide range of log information, enhancing the flexibility of the logging system.
### Configuring Logging
To configure the logging parameters, the SDK provides a set of functions. Users can set the log verbosity level, customize the log stream, and create log record streams with specific severity levels and source locations.
### Log Macros
The SDK includes convenient macros for logging, such as:
- docwire_log(severity): Conditionally logs based on the specified severity level.
- docwire_log_vars(...): Logs variables with associated values.
- docwire_log_func_with_args(...): Logs function entry with associated arguments.
### Additional Logging Features
The SDK introduces new features like logging source locations, custom streamable types, and handling of iterable and dereferenceable objects.
## API Documentation
The API documentation for the DocWire SDK/library is readily available in various formats to assist you in seamlessly integrating it into your projects. Whether you prefer reading detailed doxygen-style documentation, accessing it through binary packages, or installing it via package managers like Vcpkg, we've got you covered.
### ReadTheDocs - Doxygen Format
Our API documentation is hosted on [ReadTheDocs](https://docwire.readthedocs.io/), presented in the widely recognized and developer-friendly Doxygen format. This comprehensive documentation provides insights into the functionality, usage, and features of the DocWire library. You can explore it at your own pace to better understand how to harness the power of DocWire within your applications.
### GitHub Releases
If you're looking for a more direct way to access the documentation, you can find it bundled with our binary packages in the [GitHub Releases](https://github.com/docwire/docwire/releases) section. Simply download the appropriate release for your platform, and you'll have the API documentation readily available alongside the library itself.
### Vcpkg Package Manager
For users who prefer to manage their dependencies using Vcpkg, we've made sure that our API documentation is included with the packages you install. This means you can access the same doxygen-style documentation seamlessly as you manage and integrate DocWire into your C++ projects.
### Consistency Across Platforms
It's worth noting that no matter where you choose to access our API documentation—whether through ReadTheDocs, GitHub Releases, or Vcpkg—you will find the same comprehensive doxygen-style documentation. This ensures a consistent and reliable resource for understanding and utilizing the DocWire library.
### Why ReadTheDocs?
You might wonder why we chose ReadTheDocs to host our documentation. While some might see this as a marketing signal, we believe it's a practical choice for several reasons:
1. **Accessibility**: ReadTheDocs provides an easy-to-navigate platform that ensures our documentation is readily accessible to all users.
2. **Versioning**: We can maintain multiple versions of our documentation, ensuring that you can always find the information relevant to your specific library version.
3. **Automation**: ReadTheDocs allows us to automate the documentation publishing process, ensuring that you have the latest documentation whenever you need it without delay.
We believe in making the integration of DocWire as smooth as possible, and providing our documentation through ReadTheDocs is just one way we're committed to simplifying your experience.
Explore the documentation, experiment with the library, and feel free to reach out if you have any questions or feedback. We're here to support you on your journey with DocWire.
## Error handling: robust and secure
The DocWire SDK provides a comprehensive error handling framework designed to handle errors in a clear and specific way. This framework offers several key features that facilitate error diagnosis and handling.
### Chained exceptions
The framework supports building an exception chain, allowing the creation of a longer error context across different layers of the backtrace. This enables tracking of the sequence of events leading up to an error, making it easier to identify the root cause.
In addition to std::nested_exception, std::exception_ptr, and std::throw_with_nested, the framework also includes the [errors::make_nested](https://docwire.readthedocs.io/en/latest/namespacedocwire_1_1errors.html#afde521434d8ac75136d13bc9d42c17a2) and [errors::make_nested_ptr](https://docwire.readthedocs.io/en/latest/namespacedocwire_1_1errors.html#a7380b5b4f036f549a9c284261d9f31ac) function templates and the [errors::nested](https://docwire.readthedocs.io/en/latest/classdocwire_1_1errors_1_1nested.html) class template for easy error chaining, for example: `docwire::errors::make_nested_ptr(error_object_1, error_object_2)`.
### Type-safe context values
Context values are type-safe, meaning any type, including custom types, can be used in the context value. The context is built from original C++ types, and context objects can be accessed during the error handling process.
The [errors::impl](https://docwire.readthedocs.io/en/latest/structdocwire_1_1errors_1_1impl.html) structure template provides an implementation of the context value mechanism, including support for custom types: `throw errors::impl{custom_type_value}`.
The [errors::base](https://docwire.readthedocs.io/en/latest/structdocwire_1_1errors_1_1base.html) structure provides a base class for all error types for easy error handling: `catch (docwire::errors::base& e) { ... }`.
### Embedded context variable names and triggering expressions
Error contexts include stringified context variable names and triggering C++ expressions, providing valuable information for diagnosing and handling errors via make_error and throw_if macros.
For example: `throw_if(x < 0 || y < 0, x, y)` gives, on failure, an exception chain with `std::make_pair("triggering_expression", "x < 0 || y < 0")`, `std::make_pair("x", x)` and `std::make_pair("y", y)`.
### Categorized and tagged errors
Errors can be tagged with a custom type or multiple types like [errors::network_failure](https://docwire.readthedocs.io/en/latest/structdocwire_1_1errors_1_1network__failure.html) or [errors::file_encrypted](https://docwire.readthedocs.io/en/latest/structdocwire_1_1errors_1_1file__encrypted.html), providing fine-grained control over error handling. Decisions, such as retrying an operation or asking for a password, can be made based on the presence of a specific tag type in the error chain: `catch (const docwire::errors::base& e) { if (docwire::errors::contains_type<docwire::errors::network_failure>(e)) { retry(); } }`.
### Embedded source location
Error objects include embedded source location information, providing essential context for debugging: `catch (const docwire::errors::base& e) { std::cerr << e.source_location.file_name() << ":" << e.source_location.line() << std::endl; }`.
### Secure error messages
To reduce the risk of security breaches, the framework avoids including sensitive information in formatted error messages. There is no implicit stringification of context values, and the standard what() method is secured to return only the exception type name. Sensitive information can be retrieved from the error object on-demand only: `std::cerr << e.context_type() << ": " << e.context_string()` or, for a context value of type `T`, `if (docwire::errors::contains_type<T>(e)) { auto fn = dynamic_cast<const docwire::errors::impl<T>&>(e).context; }`.
### Easy context data retrieval
Functions like [errors::diagnostic_message](https://docwire.readthedocs.io/en/latest/namespacedocwire_1_1errors.html#a2446f6c81b6a5b338dea00cb29c40263) and [errors::contains_type](https://docwire.readthedocs.io/en/latest/namespacedocwire_1_1errors.html#aa2af5b81a3e66772f19506d8e8a93184) allow access to context data in a controlled manner, facilitating effective error handling.
For example, `std::cerr << docwire::errors::diagnostic_message(e) << std::endl;` can produce the following output:
```
[ERROR] Error "file encrypted error tag"
in void docwire::XLSParser::Implementation::processRecord(int, const std::vector&, std::string&)
at /docwire/src/xls_parser.cpp:483
with context "RC4 encryption"
in void docwire::XLSParser::Implementation::processRecord(int, const std::vector&, std::string&)
at /docwire/src/xls_parser.cpp:483
with context "Error parsing XLS document"
in std::string docwire::XLSParser::parse(docwire::ThreadSafeOLEStorage&) const
at /docwire/src/xls_parser.cpp:931
processing file tests/password_protected.xls
```
### Non-fatal errors and warnings
Errors that are not fatal are represented with the same chained error objects, but instead of using the C++ throw/catch mechanism, they are pushed to the parsing chain or returned via callbacks like other results.
This design gives them similar security and debugging capabilities to fatal errors and allows easy conversion between fatal and non-fatal errors at different backtrace levels (the severity usually cannot be decided at the point of failure) without breaking the error chain.
[You can find an example of handling non-fatal errors here](https://docwire.readthedocs.io/en/latest/handling_errors_and_warnings_8cpp-example.html)
### Modern C++ features used
The framework is designed to take advantage of modern C++ features, such as std::nested_exception and std::exception_ptr, to build an exception chain instead of adding multiple values to a single exception object. This approach ensures alignment with the latest C++ standards and best practices. Although the design shares similarities with Boost Exception, the DocWire SDK's error handling framework is more closely aligned with modern C++ standards.
## Console Application (CLI)
Welcome to the DocWire Console Application (DocWire CLI). This versatile command-line tool empowers users to extract content from documents, including text, document structure, and metadata. Whether you're processing documents for analysis, summarization, sentiment detection, or translation, DocWire CLI has you covered.
### Usage
To run the program and process a document, use the following command:
```bash
docwire [options] file_name
```
### Basic options
- **`--help`**: Display the help message.
- **`--version`**: Display the DocWire version.
- **`--verbose`**: Enable verbose logging.
- **`--input-file <file_path>`**: Specify the path to the file to process (alternatively, the file name can be provided without `--input-file`).
- **`--output_type <type>`** (default: plain_text): Set the output type. Available types include plain_text, html (preserving document structure), csv (structured data), and metadata (document information).
### Local AI Integration
Process data securely using offline AI models with the following options:
- **`--local-ai-prompt <prompt>`**: Prompt used to process text via the local AI model.
- **`--local-ai-model <path>`**: Path to the local AI model data (the built-in default model is used if not specified).
### OpenAI Integration
Unlock the power of OpenAI with the following options:
- **`--openai-chat <prompt>`**: Initiate a chat prompt for processing text and images via OpenAI.
- **`--openai-extract-entities`**: Extract entities from text and images via OpenAI.
- **`--openai-extract-keywords <N>`**: Extract N keywords/key phrases from text and images via OpenAI.
- **`--openai-summarize`**: Summarize text and images via OpenAI.
- **`--openai-detect-sentiment`**: Detect sentiment of text and images via OpenAI.
- **`--openai-analyze-data`**: Analyze text and images for important insights and generate conclusions via OpenAI.
- **`--openai-classify <categories>`**: Classify text and images via OpenAI to one of the specified categories.
- **`--openai-translate-to <language>`**: Language to translate text and images to via OpenAI.
- **`--openai-find <what>`**: Find specified phrase, object or event in text and images via OpenAI.
- **`--openai-text-to-speech`**: Convert text to speech via OpenAI.
- **`--openai-transcribe`**: Convert speech to text (transcribe) via OpenAI.
- **`--openai-key <key>`**: OpenAI API key.
- **`--openai-model <model>`** (default: gpt35_turbo): Choose the OpenAI model. Available models are: gpt35_turbo, gpt35_turbo_0125, gpt35_turbo_1106, gpt4, gpt4_0613, gpt4_32k, gpt4_32k_0613, gpt4_turbo_preview, gpt4_0125_preview, gpt4_1106_preview, gpt4_vision_preview and gpt4_1106_vision_preview.
- **`--openai-temperature <temperature>`**: Force specified temperature for OpenAI prompts.
- **`--openai-image-detail <detail>`**: Force specified image detail parameter for OpenAI image prompts. Available options are: low, high and automatic.
- **`--openai-tts-model <model>`** (default: tts1): Choose the TTS model. Available models are: tts1, tts1_hd.
- **`--openai-voice <voice>`** (default: alloy): Choose voice for text to speech conversion. Available voices are: alloy, echo, fable, onyx, nova, shimmer.
### Additional Options
- **`--language`** (default: eng): Set the document language(s) for OCR as ISO 639-3 identifiers like: spa, fra, deu, rus, chi_sim, chi_tra etc. More than 100 languages are supported. Multiple languages can be enabled.
- **`--use-stream`** (default: 0): Pass the file stream to the SDK instead of the filename.
- **`--min_creation_time <timestamp>`**: Filter emails by minimum creation time (currently applicable only to emails in PST/OST files).
- **`--max_creation_time <timestamp>`**: Filter emails by maximum creation time (currently applicable only to emails in PST/OST files).
- **`--max_nodes_number <number>`**: Filter by the maximum number of nodes.
- **`--folder_name <name>`**: Filter emails by folder name.
- **`--attachment_extension <extension>`**: Filter by attachment type.
- **––table-style